diff --git a/.clang-format b/.clang-format
new file mode 100644
index 000000000..2b00788b1
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,90 @@
+BasedOnStyle: LLVM
+AccessModifierOffset: -2
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: true
+AlignEscapedNewlines: Right
+AlignOperands: true
+AllowAllArgumentsOnNextLine: false
+AllowAllConstructorInitializersOnNextLine: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLambdasOnASingleLine: All
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterReturnType: None
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: false
+BinPackParameters: false
+BreakBeforeBraces: Custom
+BraceWrapping:
+  AfterCaseLabel: false
+  AfterClass: true
+  AfterControlStatement: true
+  AfterEnum: true
+  AfterFunction: true
+  AfterNamespace: true
+  AfterStruct: true
+  AfterUnion: true
+  BeforeCatch: true
+  BeforeElse: true
+  IndentBraces: false
+  SplitEmptyFunction: false
+  SplitEmptyRecord: false
+BreakBeforeBinaryOperators: None
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeComma
+BreakInheritanceList: BeforeComma
+ColumnLimit: 100
+CompactNamespaces: false
+ContinuationIndentWidth: 2
+IncludeBlocks:   Regroup
+IncludeCategories:
+  - Regex:           '^<cub'
+    Priority:        1
+  - Regex:           '^<thrust'
+    Priority:        2
+  - Regex:           '^<cuda'
+    Priority:        3
+  - Regex:           '^<[a-z]*>$'
+    Priority:        4
+  - Regex:           '^<unittest'
+    Priority:        5    
+  - Regex:           '.*'
+    Priority:        6
+IndentCaseLabels: true
+IndentPPDirectives: None
+IndentWidth: 2
+KeepEmptyLinesAtTheStartOfBlocks: true
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+PenaltyBreakAssignment: 30
+PenaltyBreakBeforeFirstCallParameter: 50
+PenaltyBreakComment: 0
+PenaltyBreakFirstLessLess: 0
+PenaltyBreakString: 70
+PenaltyBreakTemplateDeclaration: 0
+PenaltyExcessCharacter: 100
+PenaltyReturnTypeOnItsOwnLine: 90
+PointerAlignment: Right
+ReflowComments: true
+SortIncludes: CaseInsensitive
+SpaceAfterCStyleCast: false
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: c++11
+TabWidth: 2
+UseTab: Never
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
new file mode 100644
index 000000000..68469e1f1
--- /dev/null
+++ b/.git-blame-ignore-revs
@@ -0,0 +1,16 @@
+# Exclude these commits from git-blame and similar tools.
+#
+# To use this file, run the following command from the repo root:
+#
+# ```
+# $ git config blame.ignoreRevsFile .git-blame-ignore-revs
+# ```
+#
+# Include a brief comment with each commit added, for example:
+#
+# ```
+# d92d9f8baac5ec48a8f8718dd69f415a45efe372 # Initial clang-format
+# ```
+#
+# Only add commits that are pure formatting changes (e.g.
+# clang-format version changes, etc).
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 000000000..72def4091
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,5 @@
+blank_issues_enabled: false
+contact_links:
+  - name: Open Issue in CCCL Repository
+    url: https://github.com/NVIDIA/cccl/issues/new/choose
+    about:  This repository has moved! Please see the new home for Thrust. 
diff --git a/.github/workflows/deploy-documentation-github-pages.yml b/.github/workflows/deploy-documentation-github-pages.yml
new file mode 100644
index 000000000..508764c5c
--- /dev/null
+++ b/.github/workflows/deploy-documentation-github-pages.yml
@@ -0,0 +1,27 @@
+name: Deploy Documentation GitHub Pages
+
+on:
+  push:
+    branches:
+      - "main"
+
+  # Trigger on request.
+  workflow_dispatch:
+
+jobs:
+  deploy-documentation-github-pages:
+    runs-on: ubuntu-latest
+    container: gpuci/cccl:cuda11.7.0-devel-ubuntu20.04-gcc9
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+      - name: Generate documentation markdown
+        run: ./docs/generate_markdown.bash --clean
+      - name: Deploy generated documentation markdown to gh-pages branch
+        uses: peaceiris/actions-gh-pages@v3
+        if: github.ref == 'refs/heads/main'
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          publish_dir: ./build_docs/github_pages
+          enable_jekyll: true
+          commit_message: "Deploy Documentation: ${{ github.event.head_commit.message }}"
diff --git a/.github/workflows/mirror-main-branch-to-master-branch.yml b/.github/workflows/mirror-main-branch-to-master-branch.yml
new file mode 100644
index 000000000..f9c861a3f
--- /dev/null
+++ b/.github/workflows/mirror-main-branch-to-master-branch.yml
@@ -0,0 +1,17 @@
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  mirror-main-branch-to-master-branch:
+    name: Mirror main branch to master branch
+    runs-on: ubuntu-latest
+    steps:
+    - name: Mirror main branch to master branch
+      id: mirror
+      uses: google/mirror-branch-action@v1.0
+      with:
+        source: main
+        dest: master
+        github-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/xrefcheck-validate-markdown-links.yml b/.github/workflows/xrefcheck-validate-markdown-links.yml
new file mode 100755
index 000000000..78e5ade71
--- /dev/null
+++ b/.github/workflows/xrefcheck-validate-markdown-links.yml
@@ -0,0 +1,18 @@
+name: Check bad links
+
+on:
+  push:
+    branches: [ '*' ]
+  pull_request:
+    branches: [ '*' ]
+
+jobs:
+  xrefcheck:
+    name: Check links
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - uses: serokell/xrefcheck-action@v1
+      with:
+        xrefcheck-version: 0.2
+        xrefcheck-args: --ignored dependencies
diff --git a/.gitignore b/.gitignore
index 2dc8f7c8e..37d8ba566 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,4 @@
-thrust/system/cuda/detail/.gitignore
-.p4config
-run
-build
-doc/html
 discrete_voronoi.pgm
+*build*/
+.idea/
+.vscode
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000..0bb39f302
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "cub"]
+	path = dependencies/cub
+	url = ../cub.git
+[submodule "libcudacxx"]
+	path = dependencies/libcudacxx
+	url = ../libcudacxx.git
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 000000000..57eff4212
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,2315 @@
+# Changelog
+
+## Thrust 2.1.0
+
+### New Features
+
+- NVIDIA/thrust#1805: Add default constructors to `transform_output_iterator`
+  and `transform_input_output_iterator`. Thanks to Mark Harris (@harrism) for this contribution.
+- NVIDIA/thrust#1836: Enable constructions of vectors from `std::initializer_list`.
+
+### Bug Fixes
+
+- NVIDIA/thrust#1768: Fix type conversion warning in the `thrust::complex` utilities. Thanks to
+  Zishi Wu (@zishiwu123) for this contribution.
+- NVIDIA/thrust#1809: Fix some warnings about usage of `__host__` functions in `__device__` code.
+- NVIDIA/thrust#1825: Fix Thrust's CMake install rules. Thanks to Robert Maynard (@robertmaynard)
+  for this contribution.
+- NVIDIA/thrust#1827: Fix `thrust::reduce_by_key` when using non-default-initializable iterators.
+- NVIDIA/thrust#1832: Fix bug in device-side CDP `thrust::reduce` when using a large number of
+  inputs.
+
+### Other Enhancements
+
+- NVIDIA/thrust#1815: Update Thrust's libcu++ git submodule to version 1.8.1.
+- NVIDIA/thrust#1841: Fix invalid code in execution policy documentation example. Thanks to Raphaël
+  Frantz (@Eren121) for this contribution.
+- NVIDIA/thrust#1848: Improve error messages when attempting to launch a kernel on a device that is
+  not supported by compiled PTX versions. Thanks to Zahra Khatami (@zkhatami) for this contribution.
+- NVIDIA/thrust#1855: Remove usage of deprecated CUDA error codes.
+
+## Thrust 2.0.1
+
+### Other Enhancements
+
+- Disable CDP parallelization of device-side invocations of Thrust algorithms on SM90+. The removal
+  of device-side synchronization support in recent architectures makes Thrust's fork-join model
+  unimplementable on device, so a serial implementation will be used instead. Host-side invocations
+  of Thrust algorithms are not affected.
+
+## Thrust 2.0.0
+
+### Summary
+
+The Thrust 2.0.0 major release adds a dependency on libcu++ and contains several
+breaking changes. These include new diagnostics when inspecting device-only
+lambdas from the host, removal of the `cub` symlink in the Thrust repository
+root, and removal of the deprecated `THRUST_*_BACKEND` macros. It also includes
+several minor bugfixes and cleanups.
+
+### Breaking Changes
+
+- NVIDIA/thrust#1605: Add libcu++ dependency.
+    - A suitable version of libcu++ is provided through
+      the `${THRUST_ROOT}/dependencies/libcudacxx/` submodule.
+    - Non-cmake users may need to add the libcu++ include path to their
+      builds (`-I ${THRUST_ROOT}/dependencies/libcudacxx/include/`).
+    - The Thrust CMake packages have been updated to add this include path.
+- NVIDIA/thrust#1605: The following macros are no longer defined by default.
+  They can be re-enabled by defining `THRUST_PROVIDE_LEGACY_ARCH_MACROS`. These
+  will be removed completely in a future release.
+    - `THRUST_IS_HOST_CODE`: Replace with `NV_IF_TARGET`.
+    - `THRUST_IS_DEVICE_CODE`: Replace with `NV_IF_TARGET`.
+    - `THRUST_INCLUDE_HOST_CODE`: Replace with `NV_IF_TARGET`.
+    - `THRUST_INCLUDE_DEVICE_CODE`: Replace with `NV_IF_TARGET`.
+    - `THRUST_DEVICE_CODE`: Replace with `NV_IF_TARGET`.
+- NVIDIA/thrust#1661: Thrust's CUDA Runtime support macros have been updated to
+  support `NV_IF_TARGET`. They are now defined consistently across all
+  host/device compilation passes. This should not affect most usages of these
+  macros, but may require changes for some edge cases.
+    - `THRUST_RUNTIME_FUNCTION`: Execution space annotations for functions that
+      invoke CUDA Runtime APIs.
+        - Old behavior:
+            - RDC enabled: Defined to `__host__ __device__`
+            - RDC not enabled:
+                - NVCC host pass: Defined to `__host__ __device__`
+                - NVCC device pass: Defined to `__host__`
+        - New behavior:
+            - RDC enabled: Defined to `__host__ __device__`
+            - RDC not enabled: Defined to `__host__`
+    - `__THRUST_HAS_CUDART__`: No change in behavior, but no longer used in
+      Thrust. Provided for legacy support only. Legacy behavior:
+        - RDC enabled: Defined to 1.
+        - RDC not enabled:
+            - NVCC host pass: Defined to 1.
+            - NVCC device pass: Defined to 0.
+    - `THRUST_RDC_ENABLED`: New macro, may be combined with `NV_IF_TARGET` to
+      replace most usages of `__THRUST_HAS_CUDART__`. Behavior:
+        - RDC enabled: Macro is defined.
+        - RDC not enabled: Macro is not defined.
+- NVIDIA/thrust#1701: Remove the `cub` symlink from the root of the Thrust
+  repository.
+    - This symlink caused issues in certain build environments (e.g.
+      NVIDIA/thrust#1328).
+    - Builds that relied on this symlink will need to add the full CUB include
+      path (`-I ${THRUST_ROOT}/dependencies/cub`).
+    - CMake builds that use the Thrust packages via CPM, `add_subdirectory`,
+      or `find_package` are not affected.
+- NVIDIA/thrust#1760: A compile-time error is now emitted when a `__device__`
+  -only lambda's return type is queried from host code (requires libcu++ ≥
+  1.9.0).
+    - Due to limitations in the CUDA programming model, the result of this query
+      is unreliable, and will silently return an incorrect result. This leads to
+      difficult to debug errors.
+    - When using libcu++ 1.9.0, an error will be emitted with information about
+      work-arounds:
+        - Use a named function object with a `__device__`-only implementation
+          of `operator()`.
+        - Use a `__host__ __device__` lambda.
+        - Use `cuda::proclaim_return_type` (Added in libcu++ 1.9.0)
+- NVIDIA/thrust#1761: Removed support for deprecated `THRUST_DEVICE_BACKEND`
+  and `THRUST_HOST_BACKEND` macros. The `THRUST_DEVICE_SYSTEM`
+  and `THRUST_HOST_SYSTEM` macros should be used instead.
+
+### Bug Fixes
+
+- NVIDIA/thrust#1605: Fix some execution space warnings in the allocator
+  library.
+- NVIDIA/thrust#1683: Fix bug in `iterator_category_to_traversal` metafunctions.
+- NVIDIA/thrust#1715: Add missing `__thrust_exec_check_disable__` annotation
+  to `thrust::make_zip_function`. Thanks to @mfbalin for this contribution.
+- NVIDIA/thrust#1722: Remove CUDA-specific error handler from code that may be
+  executed on non-CUDA backends. Thanks to @dkolsen-pgi for this contribution.
+- NVIDIA/thrust#1756: Fix `copy_if` for output iterators that don't support copy
+  assignment. Thanks for @mfbalin for this contribution.
+
+### Other Enhancements
+
+- NVIDIA/thrust#1605: Removed special case code for unsupported CUDA
+  architectures.
+- NVIDIA/thrust#1605: Replace several usages of `__CUDA_ARCH__`
+  with `<nv/target>` to handle host/device code divergence.
+- NVIDIA/thrust#1752: Remove a leftover merge conflict from a documentation
+  file. Thanks to @tabedzki for this contribution.
+
+## Thrust 1.17.2
+
+### Summary
+
+Thrust 1.17.2 is a minor bugfix release that provides an updated version of CUB.
+
+## Thrust 1.17.1
+
+### Summary
+
+Thrust 1.17.1 is a minor bugfix release that provides an updated version of CUB.
+
+## Thrust 1.17.0
+
+### Summary
+
+Thrust 1.17.0 is the final minor release of the 1.X series. This release
+provides GDB pretty-printers for device vectors/references, a new `unique_count`
+algorithm, and an easier way to create tagged Thrust iterators. Several
+documentation fixes are included, which can be found on the new Thrust
+documentation site at https://nvidia.github.io/thrust. We'll be migrating
+existing documentation sources to this new location over the next few months.
+
+### New Features
+
+- NVIDIA/thrust#1586: Add new `thrust::make_tagged_iterator` convenience
+  function. Thanks to @karthikeyann for this contribution.
+- NVIDIA/thrust#1619: Add `unique_count` algorithm. Thanks to @upsj for this
+  contribution.
+- NVIDIA/thrust#1631: Add GDB pretty-printers for device vectors/references
+  to `scripts/gdb-pretty-printers.py`. Thanks to @upsj for this contribution.
+
+### Bug Fixes
+
+- NVIDIA/thrust#1671: Fixed `reduce_by_key` when called with 2^31 elements.
+
+### Other Enhancements
+
+- NVIDIA/thrust#1512: Use CUB to implement `adjacent_difference`.
+- NVIDIA/thrust#1555: Use CUB to implement `scan_by_key`.
+- NVIDIA/thrust#1611: Add new doxybook-based Thrust documentation
+  at https://nvidia.github.io/thrust.
+- NVIDIA/thrust#1639: Fixed broken link in documentation. Thanks to @jrhemstad
+  for this contribution.
+- NVIDIA/thrust#1644: Increase contrast of search input text in new doc site.
+  Thanks to @bdice for this contribution.
+- NVIDIA/thrust#1647: Add `__forceinline__` annotations to a functor wrapper.
+  Thanks to @mkuron for this contribution.
+- NVIDIA/thrust#1660: Fixed typo in documentation example for
+  `permutation_iterator`.
+- NVIDIA/thrust#1669: Add a new `explicit_cuda_stream.cu` example that shows how
+  to use explicit CUDA streams and `par`/`par_nosync` execution policies.
+
+## Thrust 1.16.0
+
+### Summary
+
+Thrust 1.16.0 provides a new “nosync” hint for the CUDA backend, as well as
+numerous bugfixes and stability improvements.
+
+#### New `thrust::cuda::par_nosync` Execution Policy
+
+Most of Thrust's parallel algorithms are fully synchronous and will block the
+calling CPU thread until all work is completed. This design avoids many pitfalls
+associated with asynchronous GPU programming, resulting in simpler and
+less-error prone usage for new CUDA developers. Unfortunately, this improvement
+in user experience comes at a performance cost that often frustrates more
+experienced CUDA programmers.
+
+Prior to this release, the only synchronous-to-asynchronous migration path for
+existing Thrust codebases involved significant refactoring, replacing calls
+to `thrust` algorithms with a limited set of `future`-based `thrust::async`
+algorithms or lower-level CUB kernels. The new `thrust::cuda::par_nosync`
+execution policy provides a new, less-invasive entry point for asynchronous
+computation.
+
+`par_nosync` is a hint to the Thrust execution engine that any non-essential
+internal synchronizations should be skipped and that an explicit synchronization
+will be performed by the caller before accessing results.
+
+While some Thrust algorithms require internal synchronization to safely compute
+their results, many do not. For example, multiple `thrust::for_each` invocations
+can be launched without waiting for earlier calls to complete:
+
+```cpp
+// Queue three `for_each` kernels:
+thrust::for_each(thrust::cuda::par_nosync, vec1.begin(), vec1.end(), Op{});
+thrust::for_each(thrust::cuda::par_nosync, vec2.begin(), vec2.end(), Op{});
+thrust::for_each(thrust::cuda::par_nosync, vec3.begin(), vec3.end(), Op{});
+
+// Do other work while kernels execute:
+do_something();
+
+// Must explictly synchronize before accessing `for_each` results:
+cudaDeviceSynchronize();
+```
+
+Thanks to @fkallen for this contribution.
+
+### Deprecation Notices
+
+#### CUDA Dynamic Parallelism Support
+
+**A future version of Thrust will remove support for CUDA Dynamic Parallelism
+(CDP).**
+
+This will only affect calls to Thrust algorithms made from CUDA device-side code
+that currently launches a kernel; such calls will instead execute sequentially
+on the calling GPU thread instead of launching a device-wide kernel.
+
+### Breaking Changes
+
+- Thrust 1.14.0 included a change that aliased the `cub` namespace
+  to `thrust::cub`. This has caused issues with ambiguous namespaces for
+  projects that declare `using namespace thrust;` from the global namespace. We
+  recommend against this practice.
+- NVIDIA/thrust#1572: Removed several unnecessary header includes. Downstream
+  projects may need to update their includes if they were relying on this
+  behavior.
+
+### New Features
+
+- NVIDIA/thrust#1568: Add `thrust::cuda::par_nosync` policy. Thanks to @fkallen
+  for this contribution.
+
+### Enhancements
+
+- NVIDIA/thrust#1511: Use CUB's new `DeviceMergeSort` API and remove Thrust's
+  internal implementation.
+- NVIDIA/thrust#1566: Improved performance of `thrust::shuffle`. Thanks to
+  @djns99 for this contribution.
+- NVIDIA/thrust#1584: Support user-defined `CMAKE_INSTALL_INCLUDEDIR` values in
+  Thrust's CMake install rules. Thanks to @robertmaynard for this contribution.
+
+### Bug Fixes
+
+- NVIDIA/thrust#1496: Fix some issues affecting `icc` builds.
+- NVIDIA/thrust#1552: Fix some collisions with the `min`/`max`  macros defined
+  in `windows.h`.
+- NVIDIA/thrust#1582: Fix issue with function type alias on 32-bit MSVC builds.
+- NVIDIA/thrust#1591: Workaround issue affecting compilation with `nvc++`.
+- NVIDIA/thrust#1597: Fix some collisions with the `small` macro defined
+  in `windows.h`.
+- NVIDIA/thrust#1599, NVIDIA/thrust#1603: Fix some issues with version handling
+  in Thrust's CMake packages.
+- NVIDIA/thrust#1614: Clarify that scan algorithm results are non-deterministic
+  for pseudo-associative operators (e.g. floating-point addition).
+
+## Thrust 1.15.0
+
+### Summary
+
+Thrust 1.15.0 provides numerous bugfixes, including non-numeric
+`thrust::sequence` support, several MSVC-related compilation fixes, fewer
+conversion warnings, `counting_iterator` initialization, and documentation
+updates.
+
+### Deprecation Notices
+
+**A future version of Thrust will remove support for CUDA Dynamic Parallelism
+(CDP).**
+
+This will only affect calls to Thrust algorithms made from CUDA device-side code
+that currently launches a kernel; such calls will instead execute sequentially
+on the calling GPU thread instead of launching a device-wide kernel.
+
+### Bug Fixes
+
+- NVIDIA/thrust#1507: Allow `thrust::sequence` to work with non-numeric types.
+  Thanks to Ben Jude (@bjude) for this contribution.
+- NVIDIA/thrust#1509: Avoid macro collision when calling `max()` on MSVC. Thanks
+  to Thomas (@tomintheshell) for this contribution.
+- NVIDIA/thrust#1514: Initialize all members in `counting_iterator`'s default
+  constructor.
+- NVIDIA/thrust#1518: Fix `std::allocator_traits` on MSVC + C++17.
+- NVIDIA/thrust#1530: Fix several `-Wconversion` warnings. Thanks to Matt
+  Stack (@matt-stack) for this contribution.
+- NVIDIA/thrust#1539: Fixed typo in `thrust::for_each` documentation. Thanks to
+  Salman (@untamedImpala) for this contribution.
+- NVIDIA/thrust#1548: Avoid name collision with `B0` macro in termios.h system
+  header. Thanks to Philip Deegan (@PhilipDeegan) for this contribution.
+
+## Thrust 1.14.0 (NVIDIA HPC SDK 21.9)
+
+Thrust 1.14.0 is a major release accompanying the NVIDIA HPC SDK 21.9.
+
+This release adds the ability to wrap the `thrust::` namespace in an external
+namespace, providing a workaround for a variety of shared library linking
+issues. Thrust also learned to detect when CUB's symbols are in a wrapped
+namespace and properly import them. To enable this feature, use
+`#define THRUST_CUB_WRAPPED_NAMESPACE foo` to wrap both Thrust and CUB in the
+`foo::` namespace. See `thrust/detail/config/namespace.h` for details and more
+namespace options.
+
+Several bugfixes are also included: The `tuple_size` and `tuple_element` helpers
+now support cv-qualified types. `scan_by_key` uses less memory.
+`thrust::iterator_traits` is better integrated with `std::iterator_traits`.
+See below for more details and references.
+
+### Breaking Changes
+
+- Thrust 1.14.0 included a change that aliased the `cub` namespace
+  to `thrust::cub`. This has caused issues with ambiguous namespaces for
+  projects that declare `using namespace thrust;` from the global namespace. We
+  recommend against this practice.
+
+### New Features
+
+- NVIDIA/thrust#1464: Add preprocessor hooks that allow `thrust::` to be wrapped
+  in an external namespace, and support cases when CUB is wrapped in an external
+  namespace.
+
+### Bug Fixes
+
+- NVIDIA/thrust#1457: Support cv-qualified types in `thrust::tuple_size` and
+  `thrust::tuple_element`. Thanks to Jake Hemstad for this contribution.
+- NVIDIA/thrust#1471: Fixed excessive memory allocation in `scan_by_key`. Thanks
+  to Lilo Huang for this contribution.
+- NVIDIA/thrust#1476: Removed dead code from the `expand` example. Thanks to
+  Lilo Huang for this contribution.
+- NVIDIA/thrust#1488: Fixed the path to the installed CUB headers in the CMake
+  `find_package` configuration files.
+- NVIDIA/thrust#1491: Fallback to `std::iterator_traits` when no
+  `thrust::iterator_traits` specialization exists for an iterator type. Thanks
+  to Divye Gala for this contribution.
+
+## Thrust 1.13.1 (CUDA Toolkit 11.5)
+
+Thrust 1.13.1 is a minor release accompanying the CUDA Toolkit 11.5.
+
+This release provides a new hook for embedding the `thrust::` namespace inside a
+custom namespace. This is intended to work around various issues related to
+linking multiple shared libraries that use Thrust. The existing `CUB_NS_PREFIX`
+and `CUB_NS_POSTFIX` macros already provided this capability for CUB; this
+update provides a simpler mechanism that is extended to and integrated with
+Thrust. Simply define `THRUST_CUB_WRAPPED_NAMESPACE` to a namespace name, and
+both `thrust::` and `cub::` will be placed inside the new namespace. Using
+different wrapped namespaces for each shared library will prevent issues like
+those reported in NVIDIA/thrust#1401.
+
+### New Features
+
+- NVIDIA/thrust#1464: Add `THRUST_CUB_WRAPPED_NAMESPACE` hooks.
+
+### Bug Fixes
+
+- NVIDIA/thrust#1488: Fix path to installed CUB in Thrust's CMake config files.
+
+## Thrust 1.13.0 (NVIDIA HPC SDK 21.7)
+
+Thrust 1.13.0 is the major release accompanying the NVIDIA HPC SDK 21.7 release.
+Notable changes include `bfloat16` radix sort support (via `thrust::sort`) and
+  memory handling fixes in the `reserve` method of Thrust's vectors.
+The `CONTRIBUTING.md` file has been expanded to include instructions for
+  building CUB as a component of Thrust, and API documentation now refers to
+  [cppreference](https://cppreference.com) instead of SGI's old STL reference.
+
+### Breaking Changes
+
+- NVIDIA/thrust#1459: Remove deprecated aliases `thrust::host_space_tag` and
+  `thrust::device_space_tag`. Use the equivalent `thrust::host_system_tag` and
+  `thrust::device_system_tag` instead.
+
+### New Features
+
+- NVIDIA/cub#306: Add radix-sort support for `bfloat16` in `thrust::sort`.
+  Thanks to Xiang Gao (@zasdfgbnm) for this contribution.
+- NVIDIA/thrust#1423: `thrust::transform_iterator` now supports non-copyable
+  types. Thanks to Jake Hemstad (@jrhemstad) for this contribution.
+- NVIDIA/thrust#1459: Introduce a new `THRUST_IGNORE_DEPRECATED_API` macro that
+  disables deprecation warnings on Thrust and CUB APIs.
+
+### Bug Fixes
+
+- NVIDIA/cub#277: Fixed sanitizer warnings when `thrust::sort` calls
+  into `cub::DeviceRadixSort`. Thanks to Andy Adinets (@canonizer) for this
+  contribution.
+- NVIDIA/thrust#1442: Reduce extraneous comparisons in `thrust::sort`'s merge
+  sort implementation.
+- NVIDIA/thrust#1447: Fix memory leak and avoid overallocation when
+  calling `reserve` on Thrust's vector containers. Thanks to Kai Germaschewski
+  (@germasch) for this contribution.
+
+### Other Enhancements
+
+- NVIDIA/thrust#1405: Update links to standard C++ documentations from sgi to
+  cppreference. Thanks to Muhammad Adeel Hussain (@AdeilH) for this
+  contribution.
+- NVIDIA/thrust#1432: Updated build instructions in `CONTRIBUTING.md` to include
+  details on building CUB's test suite as part of Thrust.
+
+## Thrust 1.12.1 (CUDA Toolkit 11.4)
+
+Thrust 1.12.1 is a trivial patch release that slightly changes the phrasing of
+a deprecation message.
+
+## Thrust 1.12.0 (NVIDIA HPC SDK 21.3)
+
+Thrust 1.12.0 is the major release accompanying the NVIDIA HPC SDK 21.3
+  and the CUDA Toolkit 11.4.
+It includes a new `thrust::universal_vector`, which holds data that is
+  accessible from both host and device. This allows users to easily leverage
+  CUDA's unified memory with Thrust.
+New asynchronous `thrust::async:exclusive_scan` and `inclusive_scan` algorithms
+  have been added, and the synchronous versions of these have been updated to
+  use `cub::DeviceScan` directly.
+CUB radix sort for floating point types is now stable when both +0.0 and -0.0
+  are present in the input. This affects some usages of `thrust::sort` and
+  `thrust::stable_sort`.
+Many compilation warnings and subtle overflow bugs were fixed in the device
+  algorithms, including a long-standing bug that returned invalid temporary
+  storage requirements when `num_items` was close to (but not
+  exceeding) `INT32_MAX`.
+This release deprecates support for Clang < 7.0 and MSVC < 2019 (aka
+  19.20/16.0/14.20).
+
+### Breaking Changes
+
+- NVIDIA/thrust#1372: Deprecate Clang < 7 and MSVC < 2019.
+- NVIDIA/thrust#1376: Standardize `thrust::scan_by_key` functors / accumulator
+    types.
+  This may change the results from `scan_by_key` when input, output, and
+    initial value types are not the same type.
+
+### New Features
+
+- NVIDIA/thrust#1251: Add two new `thrust::async::` algorithms: `inclusive_scan`
+    and `exclusive_scan`.
+- NVIDIA/thrust#1334: Add `thrust::universal_vector`, `universal_ptr`,
+    and `universal_allocator`.
+
+### Bug Fixes
+
+- NVIDIA/thrust#1347: Qualify calls to `make_reverse_iterator`.
+- NVIDIA/thrust#1359: Enable stricter warning flags. This fixes several
+  outstanding issues:
+  - NVIDIA/cub#221: Overflow in `temp_storage_bytes` when `num_items` close to
+      (but not over) `INT32_MAX`.
+  - NVIDIA/cub#228: CUB uses non-standard C++ extensions that break strict
+      compilers.
+  - NVIDIA/cub#257: Warning when compiling `GridEvenShare` with unsigned
+      offsets.
+  - NVIDIA/thrust#974: Conversion warnings in `thrust::transform_reduce`.
+  - NVIDIA/thrust#1091: Conversion warnings in `thrust::counting_iterator`.
+- NVIDIA/thrust#1373: Fix compilation error when a standard library type is
+    wrapped in `thrust::optional`.
+  Thanks to Vukasin Milovanovic for this contribution.
+- NVIDIA/thrust#1388: Fix `signbit(double)` implementation on MSVC.
+- NVIDIA/thrust#1389: Support building Thrust tests without CUDA enabled.
+
+### Other Enhancements
+
+- NVIDIA/thrust#1304: Use `cub::DeviceScan` to implement
+    `thrust::exclusive_scan` and `thrust::inclusive_scan`.
+- NVIDIA/thrust#1362, NVIDIA/thrust#1370: Update smoke test naming.
+- NVIDIA/thrust#1380: Fix typos in `set_operation` documentation.
+    Thanks to Hongyu Cai for this contribution.
+- NVIDIA/thrust#1383: Include FreeBSD license in LICENSE.md for
+  `thrust::complex` implementation.
+- NVIDIA/thrust#1384: Add missing precondition to `thrust::gather`
+    documentation.
+
+## Thrust 1.11.0 (CUDA Toolkit 11.3)
+
+Thrust 1.11.0 is a major release providing bugfixes and performance
+  enhancements.
+It includes a new sort algorithm that provides up to 2x more performance
+  from `thrust::sort` when used with certain key types and hardware.
+The new `thrust::shuffle` algorithm has been tweaked to improve the randomness
+  of the output.
+Our CMake package and build system continue to see improvements with
+  better `add_subdirectory` support, installation rules, status messages, and
+  other features that make Thrust easier to use from CMake projects.
+The release includes several other bugfixes and modernizations, and received
+  updates from 12 contributors.
+
+### New Features
+
+- NVIDIA/cub#204: New implementation for `thrust::sort` on CUDA when using
+    32/64-bit numeric keys on Pascal and up (SM60+).
+  This improved radix sort algorithm provides up to 2x more performance.
+  Thanks for Andy Adinets for this contribution.
+- NVIDIA/thrust#1310, NVIDIA/thrust#1312: Various tuple-related APIs have been
+    updated to use variadic templates.
+  Thanks for Andrew Corrigan for these contributions.
+- NVIDIA/thrust#1297: Optionally add install rules when included with
+    CMake's `add_subdirectory`.
+  Thanks to Kai Germaschewski for this contribution.
+
+### Bug Fixes
+
+- NVIDIA/thrust#1309: Fix `thrust::shuffle` to produce better quality random
+    distributions.
+  Thanks to Rory Mitchell and Daniel Stokes for this contribution.
+- NVIDIA/thrust#1337: Fix compile-time regression in `transform_inclusive_scan`
+    and `transform_exclusive_scan`.
+- NVIDIA/thrust#1306: Fix binary search `middle` calculation to avoid overflows.
+    Thanks to Richard Barnes for this contribution.
+- NVIDIA/thrust#1314: Use `size_t` for the index type parameter
+    in `thrust::tuple_element`.
+  Thanks to Andrew Corrigan for this contribution.
+- NVIDIA/thrust#1329: Fix runtime error when copying an empty
+    `thrust::device_vector` in MSVC Debug builds.
+  Thanks to Ben Jude for this contribution.
+- NVIDIA/thrust#1323: Fix and add test for cmake package install rules.
+  Thanks for Keith Kraus and Kai Germaschewski for testing and discussion.
+- NVIDIA/thrust#1338: Fix GCC version checks in `thrust::detail::is_pod`
+    implementation.
+  Thanks to Anatoliy Tomilov for this contribution.
+- NVIDIA/thrust#1289: Partial fixes for Clang 10 as host compiler.
+  Filed an NVCC bug that will be fixed in a future version of the CUDA Toolkit
+    (NVBug 3136307).
+- NVIDIA/thrust#1272: Fix ambiguous `iter_swap` call when
+    using `thrust::partition` with STL containers.
+  Thanks to Isaac Deutsch for this contribution.
+- NVIDIA/thrust#1281: Update our bundled `FindTBB.cmake` module to support
+    latest MSVC.
+- NVIDIA/thrust#1298: Use semantic versioning rules for our CMake package's
+    compatibility checks.
+  Thanks to Kai Germaschewski for this contribution.
+- NVIDIA/thrust#1300: Use `FindPackageHandleStandardArgs` to print standard
+    status messages when our CMake package is found.
+  Thanks to Kai Germaschewski for this contribution.
+- NVIDIA/thrust#1320: Use feature-testing instead of a language dialect check
+    for `thrust::remove_cvref`.
+  Thanks to Andrew Corrigan for this contribution.
+- NVIDIA/thrust#1319: Suppress GPU deprecation warnings.
+
+### Other Enhancements
+
+- NVIDIA/cub#213: Removed some tuning policies for unsupported hardware (<SM35).
+- References to the old Github repository and branch names were updated.
+  - Github's `thrust/cub` repository is now `NVIDIA/cub`.
+  - Development has moved from the `master` branch to the `main` branch.
+
+## Thrust 1.10.0 (NVIDIA HPC SDK 20.9, CUDA Toolkit 11.2)
+
+Thrust 1.10.0 is the major release accompanying the NVIDIA HPC SDK 20.9 release
+  and the CUDA Toolkit 11.2 release.
+It drops support for C++03, GCC < 5, Clang < 6, and MSVC < 2017.
+It also overhauls CMake support.
+Finally, we now have a Code of Conduct for contributors:
+https://github.com/NVIDIA/thrust/blob/main/CODE_OF_CONDUCT.md
+
+### Breaking Changes
+
+- C++03 is no longer supported.
+- GCC < 5, Clang < 6, and MSVC < 2017 are no longer supported.
+- C++11 is deprecated.
+  Using this dialect will generate a compile-time warning.
+  These warnings can be suppressed by defining
+    `THRUST_IGNORE_DEPRECATED_CPP_DIALECT` or `THRUST_IGNORE_DEPRECATED_CPP_11`.
+  Suppression is only a short term solution.
+  We will be dropping support for C++11 in the near future.
+- Asynchronous algorithms now require C++14.
+- CMake < 3.15 is no longer supported.
+- The default branch on GitHub is now called `main`.
+- Allocator and vector classes have been replaced with alias templates.
+
+### New Features
+
+- NVIDIA/thrust#1159: CMake multi-config support, which allows multiple
+    combinations of host and device systems to be built and tested at once.
+  More details can be found here: https://github.com/NVIDIA/thrust/blob/main/CONTRIBUTING.md#multi-config-cmake-options
+- CMake refactoring:
+  - Added install targets to CMake builds.
+  - Added support for CUB tests and examples.
+  - Thrust can be added to another CMake project by calling `add_subdirectory`
+      with the Thrust source root (see NVIDIA/thrust#976).
+    An example can be found here:
+      https://github.com/NVIDIA/thrust/blob/main/examples/cmake/add_subdir/CMakeLists.txt
+  - CMake < 3.15 is no longer supported.
+  - Dialects are now configured through target properties.
+    A new `THRUST_CPP_DIALECT` option has been added for single config mode.
+    Logic that modified `CMAKE_CXX_STANDARD` and `CMAKE_CUDA_STANDARD` has been
+      eliminated.
+  - Testing related CMake code has been moved to `testing/CMakeLists.txt`
+  - Example related CMake code has been moved to `examples/CMakeLists.txt`
+  - Header testing related CMake code has been moved to `cmake/ThrustHeaderTesting.cmake`
+  - CUDA configuration CMake code has been moved to to `cmake/ThrustCUDAConfig.cmake`.
+  - Now we explicitly `include(cmake/*.cmake)` files rather than searching
+      `CMAKE_MODULE_PATH` - we only want to use the ones in the repo.
+- `thrust::transform_input_output_iterator`, a variant of transform iterator
+    adapter that works as both an input iterator and an output iterator.
+  The given input function is applied after reading from the wrapped iterator
+    while the output function is applied before writing to the wrapped iterator.
+  Thanks to Trevor Smith for this contribution.
+
+### Other Enhancements
+
+- Contributor documentation: https://github.com/NVIDIA/thrust/blob/main/CONTRIBUTING.md
+- Code of Conduct: https://github.com/NVIDIA/thrust/blob/main/CODE_OF_CONDUCT.md.
+  Thanks to Conor Hoekstra for this contribution.
+- Support for all combinations of host and device systems.
+- C++17 support.
+- NVIDIA/thrust#1221: Allocator and vector classes have been replaced with
+    alias templates.
+  Thanks to Michael Francis for this contribution.
+- NVIDIA/thrust#1186: Use placeholder expressions to simplify the definitions
+    of a number of algorithms.
+  Thanks to Michael Francis for this contribution.
+- NVIDIA/thrust#1170: More conforming semantics for scan algorithms:
+  - Follow P0571's guidance regarding intermediate types.
+    - https://wg21.link/P0571
+    - The accumulator's type is now:
+      - The type of the user-supplied initial value (if provided), or
+      - The input iterator's value type if no initial value.
+  - Follow C++ standard guidance for default binary operator type.
+    - https://eel.is/c++draft/exclusive.scan#1
+    - Thrust binary/unary functors now specialize a default void template
+        parameter.
+      Types are deduced and forwarded transparently.
+    - Updated the scan's default binary operator to the new `thrust::plus<>`
+        specialization.
+  - The `thrust::intermediate_type_from_function_and_iterators` helper is no
+      longer needed and has been removed.
+- NVIDIA/thrust#1255: Always use `cudaStreamSynchronize` instead of
+    `cudaDeviceSynchronize` if the execution policy has a stream attached to it.
+  Thanks to Rong Ou for this contribution.
+- NVIDIA/thrust#1201: Tests for correct handling of legacy and per-thread
+    default streams.
+  Thanks to Rong Ou for this contribution.
+
+### Bug Fixes
+
+- NVIDIA/thrust#1260: Fix `thrust::transform_inclusive_scan` with heterogeneous
+    types.
+  Thanks to Rong Ou for this contribution.
+- NVIDIA/thrust#1258, NVC++ FS #28463: Ensure the CUDA radix sort backend
+    synchronizes before returning; otherwise, copies from temporary storage will
+    race with destruction of said temporary storage.
+- NVIDIA/thrust#1264: Evaluate `CUDA_CUB_RET_IF_FAIL` macro argument only once.
+  Thanks to Jason Lowe for this contribution.
+- NVIDIA/thrust#1262: Add missing `<stdexcept>` header.
+- NVIDIA/thrust#1250: Restore some `THRUST_DECLTYPE_RETURNS` macros in async
+    test implementations.
+- NVIDIA/thrust#1249: Use `std::iota` in `CUDATestDriver::target_devices`.
+  Thanks to Michael Francis for this contribution.
+- NVIDIA/thrust#1244: Check for macro collisions with system headers during
+    header testing.
+- NVIDIA/thrust#1224: Remove unnecessary SFINAE contexts from asynchronous
+    algorithms.
+- NVIDIA/thrust#1190: Make `out_of_memory_recovery` test trigger faster.
+- NVIDIA/thrust#1187: Elminate superfluous iterators specific to the CUDA
+    backend.
+- NVIDIA/thrust#1181: Various fixes for GoUDA.
+  Thanks to Andrei Tchouprakov for this contribution.
+- NVIDIA/thrust#1178, NVIDIA/thrust#1229: Use transparent functionals in
+    placeholder expressions, fixing issues with `thrust::device_reference` and
+    placeholder expressions and `thrust::find` with asymmetric equality
+    operators.
+- NVIDIA/thrust#1153: Switch to placement new instead of assignment to
+    construct items in uninitialized memory.
+  Thanks to Hugh Winkler for this contribution.
+- NVIDIA/thrust#1050: Fix compilation of asynchronous algorithms when RDC is
+    enabled.
+- NVIDIA/thrust#1042: Correct return type of
+    `thrust::detail::predicate_to_integral` from `bool` to `IntegralType`.
+  Thanks to Andreas Hehn for this contribution.
+- NVIDIA/thrust#1009: Avoid returning uninitialized allocators.
+  Thanks to Zhihao Yuan for this contribution.
+- NVIDIA/thrust#990: Add missing `<thrust/system/cuda/memory.h>` include to
+    `<thrust/system/cuda/detail/malloc_and_free.h>`.
+  Thanks to Robert Maynard for this contribution.
+- NVIDIA/thrust#966: Fix spurious MSVC conversion with loss of data warning in
+    sort algorithms.
+  Thanks to Zhihao Yuan for this contribution.
+- Add more metadata to mock specializations for testing iterator in
+   `testing/copy.cu`.
+- Add missing include to shuffle unit test.
+- Specialize `thrust::wrapped_function` for `void` return types because MSVC is
+    not a fan of the pattern `return static_cast<void>(expr);`.
+- Replace deprecated `tbb/tbb_thread.h` with `<thread>`.
+- Fix overcounting of initial value in TBB scans.
+- Use `thrust::advance` instead of `+=` for generic iterators.
+- Wrap the OMP flags in `-Xcompiler` for NVCC
+- Extend `ASSERT_STATIC_ASSERT` skip for the OMP backend.
+- Add missing header caught by `tbb.cuda` configs.
+- Fix "unsafe API" warnings in examples on MSVC: `s/fopen/fstream/`
+- Various C++17 fixes.
+
+## Thrust 1.9.10-1 (NVIDIA HPC SDK 20.7, CUDA Toolkit 11.1)
+
+Thrust 1.9.10-1 is the minor release accompanying the NVIDIA HPC SDK 20.7 release
+  and the CUDA Toolkit 11.1 release.
+
+### Bug Fixes
+
+- #1214, NVBug 200619442: Stop using `std::allocator` APIs deprecated in C++17.
+- #1216, NVBug 200540293: Make `thrust::optional` work with Clang when used
+    with older libstdc++.
+- #1207, NVBug 200618218: Don't force C++14 with older compilers that don't
+    support it.
+- #1218: Wrap includes of `<memory>` and `<algorithm>` to avoid circular
+    inclusion with NVC++.
+
+## Thrust 1.9.10 (NVIDIA HPC SDK 20.5)
+
+Thrust 1.9.10 is the release accompanying the NVIDIA HPC SDK 20.5 release.
+It adds CMake support for compilation with NVC++ and a number of minor bug fixes
+  for NVC++.
+It also adds CMake `find_package` support, which replaces the broken 3rd-party
+  legacy `FindThrust.cmake` script.
+C++03, C++11, GCC < 5, Clang < 6, and MSVC < 2017 are now deprecated.
+Starting with the upcoming 1.10.0 release, C++03 support will be dropped
+  entirely.
+
+### Breaking Changes
+
+- #1082: Thrust now checks that it is compatible with the version of CUB found
+    in your include path, generating an error if it is not.
+  If you are using your own version of CUB, it may be too old.
+  It is recommended to simply delete your own version of CUB and use the
+    version of CUB that comes with Thrust.
+- #1089: C++03 and C++11 are deprecated.
+  Using these dialects will generate a compile-time warning.
+  These warnings can be suppressed by defining
+    `THRUST_IGNORE_DEPRECATED_CPP_DIALECT` (to suppress C++03 and C++11
+    deprecation warnings) or `THRUST_IGNORE_DEPRECATED_CPP11` (to suppress C++11
+    deprecation warnings).
+  Suppression is only a short term solution.
+  We will be dropping support for C++03 in the 1.10.0 release and C++11 in the
+    near future.
+- #1089: GCC < 5, Clang < 6, and MSVC < 2017 are deprecated.
+  Using these compilers will generate a compile-time warning.
+  These warnings can be suppressed by defining
+    `THRUST_IGNORE_DEPRECATED_COMPILER`.
+  Suppression is only a short term solution.
+  We will be dropping support for these compilers in the near future.
+
+### New Features
+
+- #1130: CMake `find_package` support.
+  This is significant because there is a legacy `FindThrust.cmake` script
+    authored by a third party in widespread use in the community which has a
+    bug in how it parses Thrust version numbers which will cause it to
+    incorrectly parse 1.9.10.
+  This script only handles the first digit of each part of the Thrust version
+    number correctly: for example, Thrust 17.17.17 would be interpreted as
+    Thrust 1.1.1701717.
+  You can find directions for using the new CMake `find_package` support and
+    migrating away from the legacy `FindThrust.cmake` [here](https://github.com/NVIDIA/thrust/blob/main/thrust/cmake/README.md)
+- #1129: Added `thrust::detail::single_device_tls_caching_allocator`, a
+    convenient way to get an MR caching allocator for device memory, which is
+    used by NVC++.
+
+### Other Enhancements
+
+- #1129: Refactored RDC handling in CMake to be a global option and not create
+    two targets for each example and test.
+
+### Bug Fixes
+
+- #1129: Fix the legacy `thrust::return_temporary_buffer` API to support
+    passing a size.
+  This was necessary to enable usage of Thrust caching MR allocators with
+    synchronous Thrust algorithms.
+  This change has allowed NVC++'s C++17 Parallel Algorithms implementation to
+    switch to use Thrust caching MR allocators for device temporary storage,
+    which gives a 2x speedup on large multi-GPU systems such as V100 and A100
+    DGX where `cudaMalloc` is very slow.
+- #1128: Respect `CUDA_API_PER_THREAD_DEFAULT_STREAM`.
+  Thanks to Rong Ou for this contribution.
+- #1131: Fix the one-policy overload of `thrust::async::copy` to not copy the
+    policy, resolving use-afer-move issues.
+- #1145: When cleaning up type names in `unittest::base_class_name`, only call
+    `std::string::replace` if we found the substring we are looking to replace.
+- #1139: Don't use `cxx::__demangle` in NVC++.
+- #1102: Don't use `thrust::detail::normal_distribution_nvcc` for Feta because
+    it uses `erfcinv`, a non-standard function that Feta doesn't have.
+
+## Thrust 1.9.9 (CUDA Toolkit 11.0)
+
+Thrust 1.9.9 adds support for NVC++, which uses Thrust to implement
+  GPU-accelerated C++17 Parallel Algorithms.
+`thrust::zip_function` and `thrust::shuffle` were also added.
+C++03, C++11, GCC < 5, Clang < 6, and MSVC < 2017 are now deprecated.
+Starting with the upcoming 1.10.0 release, C++03 support will be dropped
+  entirely.
+All other deprecated platforms will be dropped in the near future.
+
+### Breaking Changes
+
+- #1082: Thrust now checks that it is compatible with the version of CUB found
+    in your include path, generating an error if it is not.
+  If you are using your own version of CUB, it may be too old.
+  It is recommended to simply delete your own version of CUB and use the
+    version of CUB that comes with Thrust.
+- #1089: C++03 and C++11 are deprecated.
+  Using these dialects will generate a compile-time warning.
+  These warnings can be suppressed by defining
+    `THRUST_IGNORE_DEPRECATED_CPP_DIALECT` (to suppress C++03 and C++11
+    deprecation warnings) or `THRUST_IGNORE_DEPRECATED_CPP_11` (to suppress C++11
+    deprecation warnings).
+  Suppression is only a short term solution.
+  We will be dropping support for C++03 in the 1.10.0 release and C++11 in the
+    near future.
+- #1089: GCC < 5, Clang < 6, and MSVC < 2017 are deprecated.
+  Using these compilers will generate a compile-time warning.
+  These warnings can be suppressed by defining
+  `THRUST_IGNORE_DEPRECATED_COMPILER`.
+  Suppression is only a short term solution.
+  We will be dropping support for these compilers in the near future.
+
+### New Features
+
+- #1086: Support for NVC++ aka "Feta".
+  The most significant change is in how we use `__CUDA_ARCH__`.
+  Now, there are four macros that must be used:
+  - `THRUST_IS_DEVICE_CODE`, which should be used in an `if` statement around
+      device-only code.
+  - `THRUST_INCLUDE_DEVICE_CODE`, which should be used in an `#if` preprocessor
+      directive inside of the `if` statement mentioned in the prior bullet.
+  - `THRUST_IS_HOST_CODE`, which should be used in an `if` statement around
+      host-only code.
+  - `THRUST_INCLUDE_HOST_CODE`, which should be used in an `#if` preprocessor
+      directive inside of the `if` statement mentioned in the prior bullet.
+- #1085: `thrust::shuffle`.
+  Thanks to Rory Mitchell for this contribution.
+- #1029: `thrust::zip_function`, a facility for zipping functions that take N
+    parameters instead of a tuple of N parameters as `thrust::zip_iterator`
+    does.
+  Thanks to Ben Jude for this contribution.
+- #1068: `thrust::system::cuda::managed_memory_pointer`, a universal memory
+    strongly typed pointer compatible with the ISO C++ Standard Library.
+
+### Other Enhancements
+
+- #1029: Thrust is now built and tested with NVCC warnings treated as errors.
+- #1029: MSVC C++11 support.
+- #1029: `THRUST_DEPRECATED` abstraction for generating compile-time
+    deprecation warning messages.
+- #1029: `thrust::pointer<T>::pointer_to(reference)`.
+- #1070: Unit test for `thrust::inclusive_scan` with a user defined types.
+  Thanks to Conor Hoekstra for this contribution.
+
+### Bug Fixes
+
+- #1088: Allow `thrust::replace` to take functions that have non-`const`
+    `operator()`.
+- #1094: Add missing `constexpr` to `par_t` constructors.
+  Thanks to Patrick Stotko for this contribution.
+- #1077: Remove `__device__` from CUDA MR-based device allocators to fix
+    obscure "host function called from host device function" warning that occurs
+    when you use the new Thrust MR-based allocators.
+- #1029: Remove inconsistently-used `THRUST_BEGIN`/`END_NS` macros.
+- #1029: Fix C++ dialect detection on newer MSVC.
+- #1029 Use `_Pragma`/`__pragma` instead of `#pragma` in macros.
+- #1029: Replace raw `__cplusplus` checks with the appropriate Thrust macros.
+- #1105: Add a missing `<math.h>` include.
+- #1103: Fix regression of `thrust::detail::temporary_allocator` with non-CUDA
+    back ends.
+- #1111: Use Thrust's random number engine instead of `std::`s in device code.
+- #1108: Get rid of a GCC 9 warning about deprecated generation of copy ctors.
+
+## Thrust 1.9.8-1 (NVIDIA HPC SDK 20.3)
+
+Thrust 1.9.8-1 is a variant of 1.9.8 accompanying the NVIDIA HPC SDK 20.3
+  release.
+It contains modifications necessary to serve as the implementation of NVC++'s
+  GPU-accelerated C++17 Parallel Algorithms when using the CUDA Toolkit 11.0
+  release.
+
+## Thrust 1.9.8 (CUDA Toolkit 11.0 Early Access)
+
+Thrust 1.9.8, which is included in the CUDA Toolkit 11.0 release, removes
+  Thrust's internal derivative of CUB, upstreams all relevant changes too CUB,
+  and adds CUB as a Git submodule.
+It will now be necessary to do `git clone --recursive` when checking out
+  Thrust, and to update the CUB submodule when pulling in new Thrust changes.
+Additionally, CUB is now included as a first class citizen in the CUDA toolkit.
+Thrust 1.9.8 also fixes bugs preventing most Thrust algorithms from working
+  with more than `2^31-1` elements.
+Now, `thrust::reduce`, `thrust::*_scan`, and related algorithms (aka most of
+  Thrust) work with large element counts.
+
+### Breaking Changes
+
+- Thrust will now use the version of CUB in your include path instead of its own
+    internal copy.
+  If you are using your own version of CUB, it may be older and incompatible
+    with Thrust.
+  It is recommended to simply delete your own version of CUB and use the
+    version of CUB that comes with Thrust.
+
+### Other Enhancements
+
+- Refactor Thrust and CUB to support 64-bit indices in most algorithms.
+  In most cases, Thrust now selects between kernels that use 32-bit indices and
+    64-bit indices at runtime depending on the size of the input.
+  This means large element counts work, but small element counts do not have to
+    pay for the register usage of 64-bit indices if they are not needed.
+  Now, `thrust::reduce`, `thrust::*_scan`, and related algorithms (aka most of
+    Thrust) work with more than `2^31-1` elements.
+  Notably, `thrust::sort` is still limited to less than `2^31-1` elements.
+- CUB is now a submodule and the internal copy of CUB has been removed.
+- #1051: Stop specifying the `__launch_bounds__` minimum blocks parameter
+    because it messes up register allocation and increases register pressure,
+    and we don't actually know at compile time how many blocks we will use
+    (aside from single tile kernels).
+
+### Bug Fixes
+
+- #1020: After making a CUDA API call, always clear the global CUDA error state
+    by calling `cudaGetLastError`.
+- #1021: Avoid calling destroy in the destructor of a Thrust vector if the
+    vector is empty.
+- #1046: Actually throw `thrust::bad_alloc` when `thrust::system::cuda::malloc`
+    fails instead of just constructing a temporary and doing nothing with it.
+- Add missing copy constructor or copy assignment operator to all classes that
+    GCC 9's `-Wdeprecated-copy` complains about
+- Add missing move operations to `thrust::system::cuda::vector`.
+- #1015: Check that the backend is CUDA before using CUDA-specifics in
+    `thrust::detail::temporary_allocator`.
+  Thanks to Hugh Winkler for this contribution.
+- #1055: More correctly detect the presence of aligned/sized `new`/`delete`.
+- #1043: Fix ill-formed specialization of `thrust::system::is_error_code_enum`
+    for `thrust::event_errc`.
+  Thanks to Toru Niina for this contribution.
+- #1027: Add tests for `thrust::tuple_for_each` and `thrust::tuple_subset`.
+  Thanks to Ben Jude for this contribution.
+- #1027: Use correct macro in `thrust::tuple_for_each`.
+  Thanks to Ben Jude for this contribution.
+- #1026: Use correct MSVC version formatting in CMake.
+  Thanks to Ben Jude for this contribution.
+- Workaround an NVCC issue with type aliases with template template arguments
+    containing a parameter pack.
+- Remove unused functions from the CUDA backend which call slow CUDA attribute
+    query APIs.
+- Replace `CUB_RUNTIME_FUNCTION` with `THRUST_RUNTIME_FUNCTION`.
+- Correct typo in `thrust::transform` documentation.
+  Thanks to Eden Yefet for this contribution.
+
+### Known Issues
+
+- `thrust::sort` remains limited to `2^31-1` elements for now.
+
+## Thrust 1.9.7-1 (CUDA Toolkit 10.2 for Tegra)
+
+Thrust 1.9.7-1 is a minor release accompanying the CUDA Toolkit 10.2 release
+  for Tegra.
+It is nearly identical to 1.9.7.
+
+### Bug Fixes
+
+- Remove support for GCC's broken nodiscard-like attribute.
+
+## Thrust 1.9.7 (CUDA Toolkit 10.2)
+
+Thrust 1.9.7 is a minor release accompanying the CUDA Toolkit 10.2 release.
+Unfortunately, although the version and patch numbers are identical, one bug
+  fix present in Thrust 1.9.7 (NVBug 2646034: Fix incorrect dependency handling
+  for stream acquisition in `thrust::future`) was not included in the CUDA
+  Toolkit 10.2 preview release for AArch64 SBSA.
+The tag `cuda-10.2aarch64sbsa` contains the exact version of Thrust present
+  in the CUDA Toolkit 10.2 preview release for AArch64 SBSA.
+
+### Bug Fixes
+
+- #967, NVBug 2448170: Fix the CUDA backend `thrust::for_each` so that it
+    supports large input sizes with 64-bit indices.
+- NVBug 2646034: Fix incorrect dependency handling for stream acquisition in
+    `thrust::future`.
+  - Not present in the CUDA Toolkit 10.2 preview release for AArch64 SBSA.
+- #968, NVBug 2612102: Fix the `thrust::mr::polymorphic_adaptor` to actually
+    use its template parameter.
+
+## Thrust 1.9.6-1 (NVIDIA HPC SDK 20.3)
+
+Thrust 1.9.6-1 is a variant of 1.9.6 accompanying the NVIDIA HPC SDK 20.3
+  release.
+It contains modifications necessary to serve as the implementation of NVC++'s
+  GPU-accelerated C++17 Parallel Algorithms when using the CUDA Toolkit 10.1
+  Update 2 release.
+
+## Thrust 1.9.6 (CUDA Toolkit 10.1 Update 2)
+
+Thrust 1.9.6 is a minor release accompanying the CUDA Toolkit 10.1 Update 2
+  release.
+
+### Bug Fixes
+
+- NVBug 2509847: Inconsistent alignment of `thrust::complex`
+- NVBug 2586774: Compilation failure with Clang + older libstdc++ that doesn't
+    have `std::is_trivially_copyable`
+- NVBug 200488234: CUDA header files contain Unicode characters which leads
+    compiling errors on Windows
+- #949, #973, NVBug 2422333, NVBug 2522259, NVBug 2528822:
+    `thrust::detail::aligned_reinterpret_cast` must be annotated with
+    `__host__ __device__`.
+- NVBug 2599629: Missing include in the OpenMP sort implementation
+- NVBug 200513211: Truncation warning in test code under VC142
+
+## Thrust 1.9.5 (CUDA Toolkit 10.1 Update 1)
+
+Thrust 1.9.5 is a minor release accompanying the CUDA Toolkit 10.1 Update 1
+  release.
+
+### Bug Fixes
+
+- NVBug 2502854: Fixed assignment of
+    `thrust::device_vector<thrust::complex<T>>` between host and device.
+
+## Thrust 1.9.4 (CUDA Toolkit 10.1)
+
+Thrust 1.9.4 adds asynchronous interfaces for parallel algorithms, a new
+  allocator system including caching allocators and unified memory support, as
+  well as a variety of other enhancements, mostly related to
+  C++11/C++14/C++17/C++20 support.
+The new asynchronous algorithms in the `thrust::async` namespace return
+  `thrust::event` or `thrust::future` objects, which can be waited upon to
+  synchronize with the completion of the parallel operation.
+
+### Breaking Changes
+
+Synchronous Thrust algorithms now block until all of their operations have
+  completed.
+Use the new asynchronous Thrust algorithms for non-blocking behavior.
+
+### New Features
+
+- `thrust::event` and `thrust::future<T>`, uniquely-owned asynchronous handles
+    consisting of a state (ready or not ready), content (some value; for
+    `thrust::future` only), and an optional set of objects that should be
+    destroyed only when the future's value is ready and has been consumed.
+  - The design is loosely based on C++11's `std::future`.
+  - They can be `.wait`'d on, and the value of a future can be waited on and
+      retrieved with `.get` or `.extract`.
+  - Multiple `thrust::event`s and `thrust::future`s can be combined with
+      `thrust::when_all`.
+  - `thrust::future`s can be converted to `thrust::event`s.
+  - Currently, these primitives are only implemented for the CUDA backend and
+      are C++11 only.
+- New asynchronous algorithms that return `thrust::event`/`thrust::future`s,
+    implemented as C++20 range style customization points:
+    - `thrust::async::reduce`.
+    - `thrust::async::reduce_into`, which takes a target location to store the
+        reduction result into.
+    - `thrust::async::copy`, including a two-policy overload that allows
+        explicit cross system copies which execution policy properties can be
+        attached to.
+    - `thrust::async::transform`.
+    - `thrust::async::for_each`.
+    - `thrust::async::stable_sort`.
+    - `thrust::async::sort`.
+    - By default the asynchronous algorithms use the new caching allocators.
+        Deallocation of temporary storage is deferred until the destruction of
+        the returned `thrust::future`. The content of `thrust::future`s is
+        stored in either device or universal memory and transferred to the host
+        only upon request to prevent unnecessary data migration.
+    - Asynchronous algorithms are currently only implemented for the CUDA
+        system and are C++11 only.
+- `exec.after(f, g, ...)`, a new execution policy method that takes a set of
+    `thrust::event`/`thrust::future`s and returns an execution policy that
+    operations on that execution policy should depend upon.
+- New logic and mindset for the type requirements for cross-system sequence
+    copies (currently only used by `thrust::async::copy`), based on:
+  - `thrust::is_contiguous_iterator` and `THRUST_PROCLAIM_CONTIGUOUS_ITERATOR`
+      for detecting/indicating that an iterator points to contiguous storage.
+  - `thrust::is_trivially_relocatable` and
+      `THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE` for detecting/indicating that a
+      type is `memcpy`able (based on principles from
+      [P1144](https://wg21.link/P1144)).
+  - The new approach reduces buffering, increases performance, and increases
+      correctness.
+  - The fast path is now enabled when copying CUDA `__half` and vector types with
+      `thrust::async::copy`.
+- All Thrust synchronous algorithms for the CUDA backend now actually
+    synchronize. Previously, any algorithm that did not allocate temporary
+    storage (counterexample: `thrust::sort`) and did not have a
+    computation-dependent result (counterexample: `thrust::reduce`) would
+    actually be launched asynchronously. Additionally, synchronous algorithms
+    that allocated temporary storage would become asynchronous if a custom
+    allocator was supplied that did not synchronize on allocation/deallocation,
+    unlike `cudaMalloc`/`cudaFree`. So, now `thrust::for_each`,
+    `thrust::transform`, `thrust::sort`, etc are truly synchronous. In some
+    cases this may be a performance regression; if you need asynchrony, use the
+    new asynchronous algorithms.
+- Thrust's allocator framework has been rewritten. It now uses a memory
+    resource system, similar to C++17's `std::pmr` but supporting static
+    polymorphism. Memory resources are objects that allocate untyped storage and
+    allocators are cheap handles to memory resources in this new model. The new
+    facilities live in `<thrust/mr/*>`.
+  - `thrust::mr::memory_resource<Pointer>`, the memory resource base class,
+      which takes a (possibly tagged) pointer to `void` type as a parameter.
+  - `thrust::mr::allocator<T, MemoryResource>`, an allocator backed by a memory
+      resource object.
+  - `thrust::mr::polymorphic_adaptor_resource<Pointer>`, a type-erased memory
+      resource adaptor.
+  - `thrust::mr::polymorphic_allocator<T>`, a C++17-style polymorphic allocator
+      backed by a type-erased memory resource object.
+  - New tunable C++17-style caching memory resources,
+      `thrust::mr::(disjoint_)?(un)?synchronized_pool_resource`, designed to
+      cache both small object allocations and large repetitive temporary
+      allocations. The disjoint variants use separate storage for management of
+      the pool, which is necessary if the memory being allocated cannot be
+      accessed on the host (e.g.  device memory).
+  - System-specific allocators were rewritten to use the new memory resource
+      framework.
+  - New `thrust::device_memory_resource` for allocating device memory.
+  - New `thrust::universal_memory_resource` for allocating memory that can be
+      accessed from both the host and device (e.g. `cudaMallocManaged`).
+  - New `thrust::universal_host_pinned_memory_resource` for allocating memory
+      that can be accessed from the host and the device but always resides in
+      host memory (e.g. `cudaMallocHost`).
+  - `thrust::get_per_device_resource` and `thrust::per_device_allocator`, which
+      lazily create and retrieve a per-device singleton memory resource.
+  - Rebinding mechanisms (`rebind_traits` and `rebind_alloc`) for
+      `thrust::allocator_traits`.
+  - `thrust::device_make_unique`, a factory function for creating a
+      `std::unique_ptr` to a newly allocated object in device memory.
+  - `<thrust/detail/memory_algorithms>`, a C++11 implementation of the C++17
+      uninitialized memory algorithms.
+  - `thrust::allocate_unique` and friends, based on the proposed C++23
+      [`std::allocate_unique`](https://wg21.link/P0211).
+- New type traits and metaprogramming facilities. Type traits are slowly being
+    migrated out of `thrust::detail::` and `<thrust/detail/*>`; their new home
+    will be `thrust::` and `<thrust/type_traits/*>`.
+  - `thrust::is_execution_policy`.
+  - `thrust::is_operator_less_or_greater_function_object`, which detects
+      `thrust::less`, `thrust::greater`, `std::less`, and `std::greater`.
+  - `thrust::is_operator_plus_function_object``, which detects `thrust::plus`
+      and `std::plus`.
+  - `thrust::remove_cvref(_t)?`, a C++11 implementation of C++20's
+      `thrust::remove_cvref(_t)?`.
+  - `thrust::void_t`, and various other new type traits.
+  - `thrust::integer_sequence` and friends, a C++11 implementation of C++20's
+      `std::integer_sequence`
+  - `thrust::conjunction`, `thrust::disjunction`, and `thrust::disjunction`, a
+      C++11 implementation of C++17's logical metafunctions.
+  - Some Thrust type traits (such as `thrust::is_constructible`) have been
+      redefined in terms of C++11's type traits when they are available.
+- `<thrust/detail/tuple_algorithms.h>`, new `std::tuple` algorithms:
+  - `thrust::tuple_transform`.
+  - `thrust::tuple_for_each`.
+  - `thrust::tuple_subset`.
+- Miscellaneous new `std::`-like facilities:
+  - `thrust::optional`, a C++11 implementation of C++17's `std::optional`.
+  - `thrust::addressof`, an implementation of C++11's `std::addressof`.
+  - `thrust::next` and `thrust::prev`, an implementation of C++11's `std::next`
+      and `std::prev`.
+  - `thrust::square`, a `<functional>` style unary function object that
+      multiplies its argument by itself.
+  - `<thrust/limits.h>` and `thrust::numeric_limits`, a customized version of
+      `<limits>` and `std::numeric_limits`.
+- `<thrust/detail/preprocessor.h>`, new general purpose preprocessor facilities:
+  - `THRUST_PP_CAT[2-5]`, concatenates two to five tokens.
+  - `THRUST_PP_EXPAND(_ARGS)?`, performs double expansion.
+  - `THRUST_PP_ARITY` and `THRUST_PP_DISPATCH`, tools for macro overloading.
+  - `THRUST_PP_BOOL`, boolean conversion.
+  - `THRUST_PP_INC` and `THRUST_PP_DEC`, increment/decrement.
+  - `THRUST_PP_HEAD`, a variadic macro that expands to the first argument.
+  - `THRUST_PP_TAIL`, a variadic macro that expands to all its arguments after
+      the first.
+  - `THRUST_PP_IIF`, bitwise conditional.
+  - `THRUST_PP_COMMA_IF`, and `THRUST_PP_HAS_COMMA`, facilities for adding and
+      detecting comma tokens.
+  - `THRUST_PP_IS_VARIADIC_NULLARY`, returns true if called with a nullary
+      `__VA_ARGS__`.
+  - `THRUST_CURRENT_FUNCTION`, expands to the name of the current function.
+- New C++11 compatibility macros:
+  - `THRUST_NODISCARD`, expands to `[[nodiscard]]` when available and the best
+      equivalent otherwise.
+  - `THRUST_CONSTEXPR`, expands to `constexpr` when available and the best
+      equivalent otherwise.
+  - `THRUST_OVERRIDE`, expands to `override` when available and the best
+      equivalent otherwise.
+  - `THRUST_DEFAULT`, expands to `= default;` when available and the best
+      equivalent otherwise.
+  - `THRUST_NOEXCEPT`, expands to `noexcept` when available and the best
+      equivalent otherwise.
+  - `THRUST_FINAL`, expands to `final` when available and the best equivalent
+      otherwise.
+  - `THRUST_INLINE_CONSTANT`, expands to `inline constexpr` when available and
+      the best equivalent otherwise.
+- `<thrust/detail/type_deduction.h>`, new C++11-only type deduction helpers:
+  - `THRUST_DECLTYPE_RETURNS*`, expand to function definitions with suitable
+      conditional `noexcept` qualifiers and trailing return types.
+  - `THRUST_FWD(x)`, expands to `::std::forward<decltype(x)>(x)`.
+  - `THRUST_MVCAP`, expands to a lambda move capture.
+  - `THRUST_RETOF`, expands to a decltype computing the return type of an
+      invocable.
+- New CMake build system.
+
+### New Examples
+
+- `mr_basic` demonstrates how to use the new memory resource allocator system.
+
+### Other Enhancements
+
+- Tagged pointer enhancements:
+  - New `thrust::pointer_traits` specialization for `void const*`.
+  - `nullptr` support to Thrust tagged pointers.
+  - New `explicit operator bool` for Thrust tagged pointers when using C++11
+      for `std::unique_ptr` interoperability.
+  - Added `thrust::reinterpret_pointer_cast` and `thrust::static_pointer_cast`
+      for casting Thrust tagged pointers.
+- Iterator enhancements:
+  - `thrust::iterator_system` is now SFINAE friendly.
+  - Removed cv qualifiers from iterator types when using
+      `thrust::iterator_system`.
+- Static assert enhancements:
+  - New `THRUST_STATIC_ASSERT_MSG`, takes an optional string constant to be
+      used as the error message when possible.
+  - Update `THRUST_STATIC_ASSERT(_MSG)` to use C++11's `static_assert` when
+      it's available.
+  - Introduce a way to test for static assertions.
+- Testing enhancements:
+  - Additional scalar and sequence types, including non-builtin types and
+      vectors with unified memory allocators, have been added to the list of
+      types used by generic unit tests.
+  - The generation of random input data has been improved to increase the range
+      of values used and catch more corner cases.
+  - New `unittest::truncate_to_max_representable` utility for avoiding the
+      generation of ranges that cannot be represented by the underlying element
+      type in generic unit test code.
+  - The test driver now synchronizes with CUDA devices and check for errors
+      after each test, when switching devices, and after each raw kernel launch.
+  - The `warningtester` uber header is now compiled with NVCC to avoid needing
+      to disable CUDA-specific code with the preprocessor.
+  - Fixed the unit test framework's `ASSERT_*` to print `char`s as `int`s.
+  - New `DECLARE_INTEGRAL_VARIABLE_UNITTEST` test declaration macro.
+  - New `DECLARE_VARIABLE_UNITTEST_WITH_TYPES_AND_NAME` test declaration macro.
+  - `thrust::system_error` in the CUDA backend now print out its `cudaError_t`
+      enumerator in addition to the diagnostic message.
+  - Stopped using conditionally signed types like `char`.
+
+### Bug Fixes
+
+- #897, NVBug 2062242: Fix compilation error when using `__device__` lambdas
+    with `thrust::reduce` on MSVC.
+- #908, NVBug 2089386: Static assert that `thrust::generate`/`thrust::fill`
+    isn't operating on const iterators.
+- #919 Fix compilation failure with `thrust::zip_iterator` and
+    `thrust::complex`.
+- #924, NVBug 2096679, NVBug 2315990: Fix dispatch for the CUDA backend's
+    `thrust::reduce` to use two functions (one with the pragma for disabling
+    exec checks, one with `THRUST_RUNTIME_FUNCTION`) instead of one. This fixes
+    a regression with device compilation that started in CUDA Toolkit 9.2.
+- #928, NVBug 2341455: Add missing `__host__ __device__` annotations to a
+    `thrust::complex::operator=` to satisfy GoUDA.
+- NVBug 2094642: Make `thrust::vector_base::clear` not depend on the element
+    type being default constructible.
+- NVBug 2289115: Remove flaky `simple_cuda_streams` example.
+- NVBug 2328572: Add missing `thrust::device_vector` constructor that takes an
+    allocator parameter.
+- NVBug 2455740: Update the `range_view` example to not use device-side launch.
+- NVBug 2455943: Ensure that sized unit tests that use
+    `thrust::counting_iterator` perform proper truncation.
+- NVBug 2455952: Refactor questionable `thrust::copy_if` unit tests.
+
+## Thrust 1.9.3 (CUDA Toolkit 10.0)
+
+Thrust 1.9.3 unifies and integrates CUDA Thrust and GitHub Thrust.
+
+### Bug Fixes
+
+- #725, #850, #855, #859, #860: Unify the `thrust::iter_swap` interface and fix
+    `thrust::device_reference` swapping.
+- NVBug 2004663: Add a `data` method to `thrust::detail::temporary_array` and
+    refactor temporary memory allocation in the CUDA backend to be exception
+    and leak safe.
+- #886, #894, #914: Various documentation typo fixes.
+- #724: Provide `NVVMIR_LIBRARY_DIR` environment variable to NVCC.
+- #878: Optimize `thrust::min/max_element` to only use
+    `thrust::detail::get_iterator_value` for non-numeric types.
+- #899: Make `thrust::cuda::experimental::pinned_allocator`'s comparison
+    operators `const`.
+- NVBug 2092152: Remove all includes of `<cuda.h>`.
+- #911: Fix default comparator element type for `thrust::merge_by_key`.
+
+### Acknowledgments
+
+- Thanks to Andrew Corrigan for contributing fixes for swapping interfaces.
+- Thanks to Francisco Facioni for contributing optimizations for
+    `thrust::min/max_element`.
+
+## Thrust 1.9.2 (CUDA Toolkit 9.2)
+
+Thrust 1.9.2 brings a variety of performance enhancements, bug fixes and test
+  improvements.
+CUB 1.7.5 was integrated, enhancing the performance of `thrust::sort` on
+  small data types and `thrust::reduce`.
+Changes were applied to `complex` to optimize memory access.
+Thrust now compiles with compiler warnings enabled and treated as errors.
+Additionally, the unit test suite and framework was enhanced to increase
+  coverage.
+
+### Breaking Changes
+
+- The `fallback_allocator` example was removed, as it was buggy and difficult
+    to support.
+
+### New Features
+
+- `<thrust/detail/alignment.h>`, utilities for memory alignment:
+  - `thrust::aligned_reinterpret_cast`.
+  - `thrust::aligned_storage_size`, which computes the amount of storage needed
+      for an object of a particular size and alignment.
+  - `thrust::alignment_of`, a C++03 implementation of C++11's
+      `std::alignment_of`.
+  - `thrust::aligned_storage`, a C++03 implementation of C++11's
+      `std::aligned_storage`.
+  - `thrust::max_align_t`, a C++03 implementation of C++11's
+      `std::max_align_t`.
+
+### Bug Fixes
+
+- NVBug 200385527, NVBug 200385119, NVBug 200385113, NVBug 200349350, NVBug
+    2058778: Various compiler warning issues.
+- NVBug 200355591: `thrust::reduce` performance issues.
+- NVBug 2053727: Fixed an ADL bug that caused user-supplied `allocate` to be
+    overlooked but `deallocate` to be called with GCC <= 4.3.
+- NVBug 1777043: Fixed `thrust::complex` to work with `thrust::sequence`.
+
+## Thrust 1.9.1-2 (CUDA Toolkit 9.1)
+
+Thrust 1.9.1-2 integrates version 1.7.4 of CUB and introduces a new CUDA backend
+  for `thrust::reduce` based on CUB.
+
+### Bug Fixes
+
+- NVBug 1965743: Remove unnecessary static qualifiers.
+- NVBug 1940974: Fix regression causing a compilation error when using
+    `thrust::merge_by_key` with `thrust::constant_iterator`s.
+- NVBug 1904217: Allow callables that take non-const refs to be used with
+    `thrust::reduce` and `thrust::*_scan`.
+
+## Thrust 1.9.0-5 (CUDA Toolkit 9.0)
+
+Thrust 1.9.0-5 replaces the original CUDA backend (bulk) with a new one
+  written using CUB, a high performance CUDA collectives library.
+This brings a substantial performance improvement to the CUDA backend across
+  the board.
+
+### Breaking Changes
+
+- Any code depending on CUDA backend implementation details will likely be
+    broken.
+
+### New Features
+
+- New CUDA backend based on CUB which delivers substantially higher performance.
+- `thrust::transform_output_iterator`, a fancy iterator that applies a function
+    to the output before storing the result.
+
+### New Examples
+
+- `transform_output_iterator` demonstrates use of the new fancy iterator
+    `thrust::transform_output_iterator`.
+
+### Other Enhancements
+
+- When C++11 is enabled, functors do not have to inherit from
+    `thrust::(unary|binary)_function` anymore to be used with
+    `thrust::transform_iterator`.
+- Added C++11 only move constructors and move assignment operators for
+    `thrust::detail::vector_base`-based classes, e.g. `thrust::host_vector`,
+    `thrust::device_vector`, and friends.
+
+### Bug Fixes
+
+- `sin(thrust::complex<double>)` no longer has precision loss to float.
+
+### Acknowledgments
+
+- Thanks to Manuel Schiller for contributing a C++11 based enhancement
+    regarding the deduction of functor return types, improving the performance
+    of `thrust::unique` and implementing `thrust::transform_output_iterator`.
+- Thanks to Thibault Notargiacomo for the implementation of move semantics for
+    the `thrust::vector_base`-based classes.
+- Thanks to Duane Merrill for developing CUB and helping to integrate it into
+    Thrust's backend.
+
+## Thrust 1.8.3 (CUDA Toolkit 8.0)
+
+Thrust 1.8.3 is a small bug fix release.
+
+### New Examples
+
+- `range_view` demonstrates the use of a view (a non-owning wrapper for an
+    iterator range with a container-like interface).
+
+### Bug Fixes
+
+- `thrust::(min|max|minmax)_element` can now accept raw device pointers when
+    an explicit device execution policy is used.
+- `thrust::clear` operations on vector types no longer requires the element
+    type to have a default constructor.
+
+## Thrust 1.8.2 (CUDA Toolkit 7.5)
+
+Thrust 1.8.2 is a small bug fix release.
+
+### Bug Fixes
+
+- Avoid warnings and errors concerning user functions called from
+    `__host__ __device__` functions.
+- #632: Fix an error in `thrust::set_intersection_by_key` with the CUDA backend.
+- #651: `thrust::copy` between host and device now accepts execution policies
+    with streams attached, i.e. `thrust::::cuda::par.on(stream)`.
+- #664: `thrust::for_each` and algorithms based on it no longer ignore streams
+    attached to execution policys.
+
+### Known Issues
+
+- #628: `thrust::reduce_by_key` for the CUDA backend fails for Compute
+    Capability 5.0 devices.
+
+## Thrust 1.8.1 (CUDA Toolkit 7.0)
+
+Thrust 1.8.1 is a small bug fix release.
+
+### Bug Fixes
+
+- #615, #620: Fixed `thrust::for_each` and `thrust::reduce` to no longer fail on
+    large inputs.
+
+### Known Issues
+
+- #628: `thrust::reduce_by_key` for the CUDA backend fails for Compute
+    Capability 5.0 devices.
+
+## Thrust 1.8.0
+
+Thrust 1.8.0 introduces support for algorithm invocation from CUDA device
+  code, support for CUDA streams, and algorithm performance improvements.
+Users may now invoke Thrust algorithms from CUDA device code, providing a
+  parallel algorithms library to CUDA programmers authoring custom kernels, as
+  well as allowing Thrust programmers to nest their algorithm calls within
+  functors.
+The `thrust::seq` execution policy allows users to require sequential algorithm
+  execution in the calling thread and makes a sequential algorithms library
+  available to individual CUDA threads.
+The `.on(stream)` syntax allows users to request a CUDA stream for kernels
+  launched during algorithm execution.
+Finally, new CUDA algorithm implementations provide substantial performance
+  improvements.
+
+### New Features
+
+- Algorithms in CUDA Device Code:
+    - Thrust algorithms may now be invoked from CUDA `__device__` and
+        `__host__` __device__ functions.
+      Algorithms invoked in this manner must be invoked with an execution
+        policy as the first parameter.
+      The following execution policies are supported in CUDA __device__ code:
+      - `thrust::seq`
+      - `thrust::cuda::par`
+      - `thrust::device`, when THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA.
+  - Device-side algorithm execution may not be parallelized unless CUDA Dynamic
+      Parallelism is available.
+- Execution Policies:
+  - CUDA Streams
+    - The `thrust::cuda::par.on(stream)` syntax allows users to request that
+        CUDA kernels launched during algorithm execution should occur on a given
+        stream.
+    - Algorithms executed with a CUDA stream in this manner may still
+        synchronize with other streams when allocating temporary storage or
+        returning results to the CPU.
+  - `thrust::seq`, which allows users to require that an algorithm execute
+      sequentially in the calling thread.
+- `thrust::complex`, a complex number data type.
+
+### New Examples
+
+- simple_cuda_streams demonstrates how to request a CUDA stream during
+    algorithm execution.
+- async_reduce demonstrates ways to achieve algorithm invocations which are
+    asynchronous with the calling thread.
+
+### Other Enhancements
+
+- CUDA sort performance for user-defined types is 300% faster on Tesla K20c for
+    large problem sizes.
+- CUDA merge performance is 200% faster on Tesla K20c for large problem sizes.
+- CUDA sort performance for primitive types is 50% faster on Tesla K20c for
+    large problem sizes.
+- CUDA reduce_by_key performance is 25% faster on Tesla K20c for large problem
+    sizes.
+- CUDA scan performance is 15% faster on Tesla K20c for large problem sizes.
+- fallback_allocator example is simpler.
+
+### Bug Fixes
+
+- #364: Iterators with unrelated system tags may be used with algorithms invoked
+    with an execution policy
+- #371: Do not redefine `__CUDA_ARCH__`.
+- #379: Fix crash when dereferencing transform_iterator on the host.
+- #391: Avoid use of uppercase variable names.
+- #392: Fix `thrust::copy` between `cusp::complex` and `std::complex`.
+- #396: Program compiled with gcc < 4.3 hangs during comparison sort.
+- #406: `fallback_allocator.cu` example checks device for unified addressing support.
+- #417: Avoid using `std::less<T>` in binary search algorithms.
+- #418: Avoid various warnings.
+- #443: Including version.h no longer configures default systems.
+- #578: NVCC produces warnings when sequential algorithms are used with CPU systems.
+
+### Known Issues
+
+- When invoked with primitive data types, thrust::sort, thrust::sort_by_key,
+    thrust::stable_sort, & thrust::stable_sort_by_key may
+- Sometimes linking fails when compiling with `-rdc=true` with NVCC.
+- The CUDA implementation of thrust::reduce_by_key incorrectly outputs the last
+    element in a segment of equivalent keys instead of the first.
+
+### Acknowledgments
+
+- Thanks to Sean Baxter for contributing faster CUDA reduce, merge, and scan
+    implementations.
+- Thanks to Duane Merrill for contributing a faster CUDA radix sort implementation.
+- Thanks to Filipe Maia for contributing the implementation of thrust::complex.
+
+## Thrust 1.7.2 (CUDA Toolkit 6.5)
+
+Thrust 1.7.2 is a minor bug fix release.
+
+### Bug Fixes
+
+- Avoid use of `std::min` in generic find implementation.
+
+## Thrust 1.7.1 (CUDA Toolkit 6.0)
+
+Thrust 1.7.1 is a minor bug fix release.
+
+### Bug Fixes
+
+- Eliminate identifiers in `set_operations.cu` example with leading underscore.
+- Eliminate unused variable warning in CUDA `reduce_by_key` implementation.
+- Avoid deriving function objects from `std::unary_function` and
+    `std::binary_function`.
+
+## Thrust 1.7.0 (CUDA Toolkit 5.5)
+
+Thrust 1.7.0 introduces a new interface for controlling algorithm execution as
+  well as several new algorithms and performance improvements.
+With this new interface, users may directly control how algorithms execute as
+  well as details such as the allocation of temporary storage.
+Key/value versions of thrust::merge and the set operation algorithms have been
+  added, as well stencil versions of partitioning algorithms.
+thrust::tabulate has been introduced to tabulate the values of functions taking
+  integers.
+For 32b types, new CUDA merge and set operations provide 2-15x faster
+  performance while a new CUDA comparison sort provides 1.3-4x faster
+  performance.
+Finally, a new TBB reduce_by_key implementation provides 80% faster
+  performance.
+
+### Breaking Changes
+
+- Dispatch:
+  - Custom user backend systems' tag types must now inherit from the
+      corresponding system's execution_policy template (e.g.
+      thrust::cuda::execution_policy) instead of the tag struct (e.g.
+      thrust::cuda::tag). Otherwise, algorithm specializations will silently go
+      unfound during dispatch. See examples/minimal_custom_backend.cu and
+      examples/cuda/fallback_allocator.cu for usage examples.
+  - thrust::advance and thrust::distance are no longer dispatched based on
+      iterator system type and thus may no longer be customized.
+- Iterators:
+  - iterator_facade and iterator_adaptor's Pointer template parameters have
+      been eliminated.
+  - iterator_adaptor has been moved into the thrust namespace (previously
+      thrust::experimental::iterator_adaptor).
+  - iterator_facade has been moved into the thrust namespace (previously
+      thrust::experimental::iterator_facade).
+  - iterator_core_access has been moved into the thrust namespace (previously
+      thrust::experimental::iterator_core_access).
+  - All iterators' nested pointer typedef (the type of the result of
+      operator->) is now void instead of a pointer type to indicate that such
+      expressions are currently impossible.
+  - Floating point counting_iterators' nested difference_type typedef is now a
+      signed integral type instead of a floating point type.
+- Other:
+  - normal_distribution has been moved into the thrust::random namespace
+      (previously thrust::random::experimental::normal_distribution).
+  - Placeholder expressions may no longer include the comma operator.
+
+### New Features
+- Execution Policies:
+  - Users may directly control the dispatch of algorithm invocations with
+      optional execution policy arguments.
+    For example, instead of wrapping raw pointers allocated by cudaMalloc with
+      thrust::device_ptr, the thrust::device execution_policy may be passed as
+      an argument to an algorithm invocation to enable CUDA execution.
+  - The following execution policies are supported in this version:
+    - `thrust::host`
+    - `thrust::device`
+    - `thrust::cpp::par`
+    - `thrust::cuda::par`
+    - `thrust::omp::par`
+    - `thrust::tbb::par`
+- Algorithms:
+  - `thrust::merge_by_key`
+  - `thrust::partition` with stencil
+  - `thrust::partition_copy` with stencil
+  - `thrust::set_difference_by_key`
+  - `thrust::set_intersection_by_key`
+  - `thrust::set_symmetric_difference_by_key`
+  - `thrust::set_union_by_key`
+  - `thrust::stable_partition with stencil`
+  - `thrust::stable_partition_copy with stencil`
+  - `thrust::tabulate`
+- Memory Allocation:
+	- `thrust::malloc`
+	- `thrust::free`
+  - `thrust::get_temporary_buffer`
+  - `thrust::return_temporary_buffer`
+
+### New Examples
+
+- uninitialized_vector demonstrates how to use a custom allocator to avoid the
+    automatic initialization of elements in thrust::device_vector.
+
+### Other Enhancements
+
+- Authors of custom backend systems may manipulate arbitrary state during
+    algorithm dispatch by incorporating it into their execution_policy parameter.
+- Users may control the allocation of temporary storage during algorithm
+    execution by passing standard allocators as parameters via execution policies
+    such as thrust::device.
+- THRUST_DEVICE_SYSTEM_CPP has been added as a compile-time target for the
+    device backend.
+- CUDA merge performance is 2-15x faster.
+- CUDA comparison sort performance is 1.3-4x faster.
+- CUDA set operation performance is 1.5-15x faster.
+- TBB reduce_by_key performance is 80% faster.
+- Several algorithms have been parallelized with TBB.
+- Support for user allocators in vectors has been improved.
+- The sparse_vector example is now implemented with merge_by_key instead of
+    sort_by_key.
+- Warnings have been eliminated in various contexts.
+- Warnings about __host__ or __device__-only functions called from __host__
+    __device__ functions have been eliminated in various contexts.
+- Documentation about algorithm requirements have been improved.
+- Simplified the minimal_custom_backend example.
+- Simplified the cuda/custom_temporary_allocation example.
+- Simplified the cuda/fallback_allocator example.
+
+### Bug Fixes
+
+- #248: Fix broken `thrust::counting_iterator<float>` behavior with OpenMP.
+- #231, #209: Fix set operation failures with CUDA.
+- #187: Fix incorrect occupancy calculation with CUDA.
+- #153: Fix broken multi GPU behavior with CUDA.
+- #142: Eliminate warning produced by `thrust::random::taus88` and MSVC 2010.
+- #208: Correctly initialize elements in temporary storage when necessary.
+- #16: Fix compilation error when sorting bool with CUDA.
+- #10: Fix ambiguous overloads of `thrust::reinterpret_tag`.
+
+### Known Issues
+
+- GCC 4.3 and lower may fail to dispatch thrust::get_temporary_buffer correctly
+    causing infinite recursion in examples such as
+    cuda/custom_temporary_allocation.
+
+### Acknowledgments
+
+- Thanks to Sean Baxter, Bryan Catanzaro, and Manjunath Kudlur for contributing
+    a faster merge implementation for CUDA.
+- Thanks to Sean Baxter for contributing a faster set operation implementation
+    for CUDA.
+- Thanks to Cliff Woolley for contributing a correct occupancy calculation
+    algorithm.
+
+## Thrust 1.6.0
+
+Thrust 1.6.0 provides an interface for customization and extension and a new
+  backend system based on the Threading Building Blocks library.
+With this new interface, programmers may customize the behavior of specific
+  algorithms as well as control the allocation of temporary storage or invent
+  entirely new backends.
+These enhancements also allow multiple different backend systems
+  such as CUDA and OpenMP to coexist within a single program.
+Support for TBB allows Thrust programs to integrate more naturally into
+  applications which may already employ the TBB task scheduler.
+
+### Breaking Changes
+
+- The header <thrust/experimental/cuda/pinned_allocator.h> has been moved to
+    <thrust/system/cuda/experimental/pinned_allocator.h>
+- thrust::experimental::cuda::pinned_allocator has been moved to
+    thrust::cuda::experimental::pinned_allocator
+- The macro THRUST_DEVICE_BACKEND has been renamed THRUST_DEVICE_SYSTEM
+- The macro THRUST_DEVICE_BACKEND_CUDA has been renamed THRUST_DEVICE_SYSTEM_CUDA
+- The macro THRUST_DEVICE_BACKEND_OMP has been renamed THRUST_DEVICE_SYSTEM_OMP
+- thrust::host_space_tag has been renamed thrust::host_system_tag
+- thrust::device_space_tag has been renamed thrust::device_system_tag
+- thrust::any_space_tag has been renamed thrust::any_system_tag
+- thrust::iterator_space has been renamed thrust::iterator_system
+
+### New Features
+
+- Backend Systems
+  - Threading Building Blocks (TBB) is now supported
+- Algorithms
+  - `thrust::for_each_n`
+  - `thrust::raw_reference_cast`
+- Types
+  - `thrust::pointer`
+  - `thrust::reference`
+
+### New Examples
+
+- `cuda/custom_temporary_allocation`
+- `cuda/fallback_allocator`
+- `device_ptr`
+- `expand`
+- `minimal_custom_backend`
+- `raw_reference_cast`
+- `set_operations`
+
+### Other Enhancements
+
+- `thrust::for_each` now returns the end of the input range similar to most
+    other algorithms.
+- `thrust::pair` and `thrust::tuple` have swap functionality.
+- All CUDA algorithms now support large data types.
+- Iterators may be dereferenced in user `__device__` or `__global__` functions.
+- The safe use of different backend systems is now possible within a single
+  binary
+
+### Bug Fixes
+
+- #469 `min_element` and `max_element` algorithms no longer require a const comparison operator
+
+### Known Issues
+
+- NVCC may crash when parsing TBB headers on Windows.
+
+## Thrust 1.5.3 (CUDA Toolkit 5.0)
+
+Thrust 1.5.3 is a minor bug fix release.
+
+### Bug Fixes
+
+- Avoid warnings about potential race due to `__shared__` non-POD variable
+
+## Thrust 1.5.2 (CUDA Toolkit 4.2)
+
+Thrust 1.5.2 is a minor bug fix release.
+
+### Bug Fixes
+
+- Fixed warning about C-style initialization of structures
+
+## Thrust 1.5.1 (CUDA Toolkit 4.1)
+
+Thrust 1.5.1 is a minor bug fix release.
+
+### Bug Fixes
+
+- Sorting data referenced by permutation_iterators on CUDA produces invalid results
+
+## Thrust 1.5.0
+
+Thrust 1.5.0 provides introduces new programmer productivity and performance
+  enhancements.
+New functionality for creating anonymous "lambda" functions has been added.
+A faster host sort provides 2-10x faster performance for sorting arithmetic
+  types on (single-threaded) CPUs.
+A new OpenMP sort provides 2.5x-3.0x speedup over the host sort using a
+  quad-core CPU.
+When sorting arithmetic types with the OpenMP backend the combined performance
+  improvement is 5.9x for 32-bit integers and ranges from 3.0x (64-bit types) to
+  14.2x (8-bit types).
+A new CUDA `reduce_by_key` implementation provides 2-3x faster
+  performance.
+
+### Breaking Changes
+- device_ptr<void> no longer unsafely converts to device_ptr<T> without an
+    explicit cast.
+  Use the expression device_pointer_cast(static_cast<int*>(void_ptr.get())) to
+    convert, for example, device_ptr<void> to device_ptr<int>.
+
+### New Features
+
+- Algorithms:
+  - Stencil-less `thrust::transform_if`.
+- Lambda placeholders
+
+### New Examples
+- lambda
+
+### Other Enhancements
+
+- Host sort is 2-10x faster for arithmetic types
+- OMP sort provides speedup over host sort
+- `reduce_by_key` is 2-3x faster
+- `reduce_by_key` no longer requires O(N) temporary storage
+- CUDA scan algorithms are 10-40% faster
+- `host_vector` and `device_vector` are now documented
+- out-of-memory exceptions now provide detailed information from CUDART
+- improved histogram example
+- `device_reference` now has a specialized swap
+- `reduce_by_key` and scan algorithms are compatible with `discard_iterator`
+
+### Bug Fixes
+
+- #44: Allow `thrust::host_vector` to compile when `value_type` uses
+    `__align__`.
+- #198: Allow `thrust::adjacent_difference` to permit safe in-situ operation.
+- #303: Make thrust thread-safe.
+- #313: Avoid race conditions in `thrust::device_vector::insert`.
+- #314: Avoid unintended ADL invocation when dispatching copy.
+- #365: Fix merge and set operation failures.
+
+### Known Issues
+
+- None
+
+### Acknowledgments
+
+- Thanks to Manjunath Kudlur for contributing his Carbon library, from which
+    the lambda functionality is derived.
+- Thanks to Jean-Francois Bastien for suggesting a fix for #303.
+
+## Thrust 1.4.0 (CUDA Toolkit 4.0)
+
+Thrust 1.4.0 is the first release of Thrust to be included in the CUDA Toolkit.
+Additionally, it brings many feature and performance improvements.
+New set theoretic algorithms operating on sorted sequences have been added.
+Additionally, a new fancy iterator allows discarding redundant or otherwise
+  unnecessary output from algorithms, conserving memory storage and bandwidth.
+
+### Breaking Changes
+
+- Eliminations
+  - `thrust/is_sorted.h`
+  - `thrust/utility.h`
+  - `thrust/set_intersection.h`
+  - `thrust/experimental/cuda/ogl_interop_allocator.h` and the functionality
+      therein
+  - `thrust::deprecated::copy_when`
+  - `thrust::deprecated::absolute_value`
+  - `thrust::deprecated::copy_when`
+  - `thrust::deprecated::absolute_value`
+  - `thrust::deprecated::copy_when`
+  - `thrust::deprecated::absolute_value`
+  - `thrust::gather` and `thrust::scatter` from host to device and vice versa
+      are no longer supported.
+  - Operations which modify the elements of a thrust::device_vector are no longer
+      available from source code compiled without nvcc when the device backend
+      is CUDA.
+    Instead, use the idiom from the cpp_interop example.
+
+### New Features
+
+- Algorithms:
+  - `thrust::copy_n`
+  - `thrust::merge`
+  - `thrust::set_difference`
+  - `thrust::set_symmetric_difference`
+  - `thrust::set_union`
+
+- Types
+  - `thrust::discard_iterator`
+
+- Device Support:
+  - Compute Capability 2.1 GPUs.
+
+### New Examples
+
+- run_length_decoding
+
+### Other Enhancements
+
+- Compilation warnings are substantially reduced in various contexts.
+- The compilation time of thrust::sort, thrust::stable_sort,
+    thrust::sort_by_key, and thrust::stable_sort_by_key are substantially
+    reduced.
+- A fast sort implementation is used when sorting primitive types with
+    thrust::greater.
+- The performance of thrust::set_intersection is improved.
+- The performance of thrust::fill is improved on SM 1.x devices.
+- A code example is now provided in each algorithm's documentation.
+- thrust::reverse now operates in-place
+
+### Bug Fixes
+
+- #212: `thrust::set_intersection` works correctly for large input sizes.
+- #275: `thrust::counting_iterator` and `thrust::constant_iterator` work
+    correctly with OpenMP as the backend when compiling with optimization.
+- #256: `min` and `max` correctly return their first argument as a tie-breaker
+- #248: `NDEBUG` is interpreted incorrectly
+
+### Known Issues
+
+- NVCC may generate code containing warnings when compiling some Thrust
+    algorithms.
+- When compiling with `-arch=sm_1x`, some Thrust algorithms may cause NVCC to
+    issue benign pointer advisories.
+- When compiling with `-arch=sm_1x` and -G, some Thrust algorithms may fail to
+    execute correctly.
+- `thrust::inclusive_scan`, `thrust::exclusive_scan`,
+    `thrust::inclusive_scan_by_key`, and `thrust::exclusive_scan_by_key` are
+    currently incompatible with `thrust::discard_iterator`.
+
+### Acknowledgments
+
+- Thanks to David Tarjan for improving the performance of set_intersection.
+- Thanks to Duane Merrill for continued help with sort.
+- Thanks to Nathan Whitehead for help with CUDA Toolkit integration.
+
+## Thrust 1.3.0
+
+Thrust 1.3.0 provides support for CUDA Toolkit 3.2 in addition to many feature
+  and performance enhancements.
+Performance of the sort and sort_by_key algorithms is improved by as much as 3x
+  in certain situations.
+The performance of stream compaction algorithms, such as copy_if, is improved
+  by as much as 2x.
+CUDA errors are now converted to runtime exceptions using the system_error
+  interface.
+Combined with a debug mode, also new in 1.3, runtime errors can be located with
+  greater precision.
+Lastly, a few header files have been consolidated or renamed for clarity.
+See the deprecations section below for additional details.
+
+### Breaking Changes
+
+- Promotions
+  - thrust::experimental::inclusive_segmented_scan has been renamed
+      thrust::inclusive_scan_by_key and exposes a different interface
+  - thrust::experimental::exclusive_segmented_scan has been renamed
+      thrust::exclusive_scan_by_key and exposes a different interface
+  - thrust::experimental::partition_copy has been renamed
+      thrust::partition_copy and exposes a different interface
+  - thrust::next::gather has been renamed thrust::gather
+  - thrust::next::gather_if has been renamed thrust::gather_if
+  - thrust::unique_copy_by_key has been renamed thrust::unique_by_key_copy
+- Deprecations
+  - thrust::copy_when has been renamed thrust::deprecated::copy_when
+  - thrust::absolute_value has been renamed thrust::deprecated::absolute_value
+  - The header thrust/set_intersection.h is now deprecated; use
+      thrust/set_operations.h instead
+  - The header thrust/utility.h is now deprecated; use thrust/swap.h instead
+  - The header thrust/swap_ranges.h is now deprecated; use thrust/swap.h instead
+- Eliminations
+  - thrust::deprecated::gather
+  - thrust::deprecated::gather_if
+  - thrust/experimental/arch.h and the functions therein
+  - thrust/sorting/merge_sort.h
+  - thrust/sorting/radix_sort.h
+- NVCC 2.3 is no longer supported
+
+### New Features
+
+- Algorithms:
+  - `thrust::exclusive_scan_by_key`
+  - `thrust::find`
+  - `thrust::find_if`
+  - `thrust::find_if_not`
+  - `thrust::inclusive_scan_by_key`
+  - `thrust::is_partitioned`
+  - `thrust::is_sorted_until`
+  - `thrust::mismatch`
+  - `thrust::partition_point`
+  - `thrust::reverse`
+  - `thrust::reverse_copy`
+  - `thrust::stable_partition_copy`
+
+- Types:
+  - `thrust::system_error` and related types.
+  - `thrust::experimental::cuda::ogl_interop_allocator`.
+  - `thrust::bit_and`, `thrust::bit_or`, and `thrust::bit_xor`.
+
+- Device Support:
+  - GF104-based GPUs.
+
+### New Examples
+
+- opengl_interop.cu
+- repeated_range.cu
+- simple_moving_average.cu
+- sparse_vector.cu
+- strided_range.cu
+
+### Other Enhancements
+
+- Performance of thrust::sort and thrust::sort_by_key is substantially improved
+    for primitive key types
+- Performance of thrust::copy_if is substantially improved
+- Performance of thrust::reduce and related reductions is improved
+- THRUST_DEBUG mode added
+- Callers of Thrust functions may detect error conditions by catching
+    thrust::system_error, which derives from std::runtime_error
+- The number of compiler warnings generated by Thrust has been substantially
+    reduced
+- Comparison sort now works correctly for input sizes > 32M
+- min & max usage no longer collides with <windows.h> definitions
+- Compiling against the OpenMP backend no longer requires nvcc
+- Performance of device_vector initialized in .cpp files is substantially
+    improved in common cases
+- Performance of thrust::sort_by_key on the host is substantially improved
+
+### Bug Fixes
+
+- Debug device code now compiles correctly
+- thrust::uninitialized_copy and thrust::uninitialized_fill now dispatch
+    constructors on the device rather than the host
+
+### Known Issues
+
+- #212 set_intersection is known to fail for large input sizes
+- partition_point is known to fail for 64b types with nvcc 3.2
+
+Acknowledgments
+- Thanks to Duane Merrill for contributing a fast CUDA radix sort implementation
+- Thanks to Erich Elsen for contributing an implementation of find_if
+- Thanks to Andrew Corrigan for contributing changes which allow the OpenMP
+    backend to compile in the absence of nvcc
+- Thanks to Andrew Corrigan, Cliff Wooley, David Coeurjolly, Janick Martinez
+    Esturo, John Bowers, Maxim Naumov, Michael Garland, and Ryuta Suzuki for
+    bug reports
+- Thanks to Cliff Woolley for help with testing
+
+## Thrust 1.2.1
+
+Thrust 1.2.1 is a small bug fix release that is compatible with the CUDA
+  Toolkit 3.1 release.
+
+### Known Issues
+
+- `thrust::inclusive_scan` and `thrust::exclusive_scan` may fail with very
+    large types.
+- MSVC may fail to compile code using both sort and binary search algorithms.
+- `thrust::uninitialized_fill` and `thrust::uninitialized_copy` dispatch
+    constructors on the host rather than the device.
+- #109: Some algorithms may exhibit poor performance with the OpenMP backend
+    with large numbers (>= 6) of CPU threads.
+- `thrust::default_random_engine::discard` is not accelerated with NVCC 2.3
+- NVCC 3.1 may fail to compile code using types derived from
+    `thrust::subtract_with_carry_engine`, such as `thrust::ranlux24` and
+    `thrust::ranlux48`.
+
+## Thrust 1.2.0
+
+Thrust 1.2.0 introduces support for compilation to multicore CPUs and the Ocelot
+  virtual machine, and several new facilities for pseudo-random number
+  generation.
+New algorithms such as set intersection and segmented reduction have also been
+  added.
+Lastly, improvements to the robustness of the CUDA backend ensure correctness
+  across a broad set of (uncommon) use cases.
+
+### Breaking Changes
+
+- `thrust::gather`'s interface was incorrect and has been removed.
+  The old interface is deprecated but will be preserved for Thrust version 1.2
+    at `thrust::deprecated::gather` and `thrust::deprecated::gather_if`.
+  The new interface is provided at `thrust::next::gather` and
+    `thrust::next::gather_if`.
+  The new interface will be promoted to `thrust::` in Thrust version 1.3.
+  For more details, please refer to [this thread](http://groups.google.com/group/thrust-users/browse_thread/thread/f5f0583cb97b51fd).
+- The `thrust::sorting` namespace has been deprecated in favor of the top-level
+    sorting functions, such as `thrust::sort` and `thrust::sort_by_key`.
+- Removed support for `thrust::equal` between host & device sequences.
+- Removed support for `thrust::scatter` between host & device sequences.
+
+### New Features
+
+- Algorithms:
+  - `thrust::reduce_by_key`
+  - `thrust::set_intersection`
+  - `thrust::unique_copy`
+  - `thrust::unique_by_key`
+  - `thrust::unique_copy_by_key`
+- Types
+- Random Number Generation:
+  - `thrust::discard_block_engine`
+  - `thrust::default_random_engine`
+  - `thrust::linear_congruential_engine`
+  - `thrust::linear_feedback_shift_engine`
+  - `thrust::subtract_with_carry_engine`
+  - `thrust::xor_combine_engine`
+  - `thrust::minstd_rand`
+  - `thrust::minstd_rand0`
+  - `thrust::ranlux24`
+  - `thrust::ranlux48`
+  - `thrust::ranlux24_base`
+  - `thrust::ranlux48_base`
+  - `thrust::taus88`
+  - `thrust::uniform_int_distribution`
+  - `thrust::uniform_real_distribution`
+  - `thrust::normal_distribution` (experimental)
+- Function Objects:
+  - `thrust::project1st`
+  - `thrust::project2nd`
+- `thrust::tie`
+- Fancy Iterators:
+  - `thrust::permutation_iterator`
+  - `thrust::reverse_iterator`
+- Vector Functions:
+  - `operator!=`
+  - `rbegin`
+  - `crbegin`
+  - `rend`
+  - `crend`
+  - `data`
+  - `shrink_to_fit`
+- Device Support:
+  - Multicore CPUs via OpenMP.
+  - Fermi-class GPUs.
+  - Ocelot virtual machines.
+- Support for NVCC 3.0.
+
+### New Examples
+
+- `cpp_integration`
+- `histogram`
+- `mode`
+- `monte_carlo`
+- `monte_carlo_disjoint_sequences`
+- `padded_grid_reduction`
+- `permutation_iterator`
+- `row_sum`
+- `run_length_encoding`
+- `segmented_scan`
+- `stream_compaction`
+- `summary_statistics`
+- `transform_iterator`
+- `word_count`
+
+### Other Enhancements
+
+- Integer sorting performance is improved when max is large but (max - min) is
+    small and when min is negative
+- Performance of `thrust::inclusive_scan` and `thrust::exclusive_scan` is
+    improved by 20-25% for primitive types.
+
+### Bug Fixes
+
+- #8 cause a compiler error if the required compiler is not found rather than a
+    mysterious error at link time
+- #42 device_ptr & device_reference are classes rather than structs,
+    eliminating warnings on certain platforms
+- #46 gather & scatter handle any space iterators correctly
+- #51 thrust::experimental::arch functions gracefully handle unrecognized GPUs
+- #52 avoid collisions with common user macros such as BLOCK_SIZE
+- #62 provide better documentation for device_reference
+- #68 allow built-in CUDA vector types to work with device_vector in pure C++
+    mode
+- #102 eliminated a race condition in device_vector::erase
+- various compilation warnings eliminated
+
+### Known Issues
+
+- inclusive_scan & exclusive_scan may fail with very large types
+- MSVC may fail to compile code using both sort and binary search algorithms
+- uninitialized_fill & uninitialized_copy dispatch constructors on the host
+    rather than the device
+- #109 some algorithms may exhibit poor performance with the OpenMP backend
+    with large numbers (>= 6) of CPU threads
+- default_random_engine::discard is not accelerated with nvcc 2.3
+
+### Acknowledgments
+
+- Thanks to Gregory Diamos for contributing a CUDA implementation of
+    set_intersection
+- Thanks to Ryuta Suzuki & Gregory Diamos for rigorously testing Thrust's unit
+    tests and examples against Ocelot
+- Thanks to Tom Bradley for contributing an implementation of normal_distribution
+- Thanks to Joseph Rhoads for contributing the example summary_statistics
+
+## Thrust 1.1.1
+
+Thrust 1.1.1 is a small bug fix release that is compatible with the CUDA
+  Toolkit 2.3a release and Mac OSX Snow Leopard.
+
+## Thrust 1.1.0
+
+Thrust 1.1.0 introduces fancy iterators, binary search functions, and several
+  specialized reduction functions.
+Experimental support for segmented scans has also been added.
+
+### Breaking Changes
+
+- `thrust::counting_iterator` has been moved into the `thrust` namespace
+    (previously `thrust::experimental`).
+
+### New Features
+
+- Algorithms:
+  - `thrust::copy_if`
+  - `thrust::lower_bound`
+  - `thrust::upper_bound`
+  - `thrust::vectorized lower_bound`
+  - `thrust::vectorized upper_bound`
+  - `thrust::equal_range`
+  - `thrust::binary_search`
+  - `thrust::vectorized binary_search`
+  - `thrust::all_of`
+  - `thrust::any_of`
+  - `thrust::none_of`
+  - `thrust::minmax_element`
+  - `thrust::advance`
+  - `thrust::inclusive_segmented_scan` (experimental)
+  - `thrust::exclusive_segmented_scan` (experimental)
+- Types:
+  - `thrust::pair`
+  - `thrust::tuple`
+  - `thrust::device_malloc_allocator`
+- Fancy Iterators:
+  - `thrust::constant_iterator`
+  - `thrust::counting_iterator`
+  - `thrust::transform_iterator`
+  - `thrust::zip_iterator`
+
+### New Examples
+
+- Computing the maximum absolute difference between vectors.
+- Computing the bounding box of a two-dimensional point set.
+- Sorting multiple arrays together (lexicographical sorting).
+- Constructing a summed area table.
+- Using `thrust::zip_iterator` to mimic an array of structs.
+- Using `thrust::constant_iterator` to increment array values.
+
+### Other Enhancements
+
+- Added pinned memory allocator (experimental).
+- Added more methods to host_vector & device_vector (issue #4).
+- Added variant of remove_if with a stencil argument (issue #29).
+- Scan and reduce use cudaFuncGetAttributes to determine grid size.
+- Exceptions are reported when temporary device arrays cannot be allocated.
+
+### Bug Fixes
+
+- #5: Make vector work for larger data types
+- #9: stable_partition_copy doesn't respect OutputIterator concept semantics
+- #10: scans should return OutputIterator
+- #16: make algorithms work for larger data types
+- #27: Dispatch radix_sort even when comp=less<T> is explicitly provided
+
+### Known Issues
+
+- Using functors with Thrust entry points may not compile on Mac OSX with gcc
+    4.0.1.
+- `thrust::uninitialized_copy` and `thrust::uninitialized_fill` dispatch
+    constructors on the host rather than the device.
+- `thrust::inclusive_scan`, `thrust::inclusive_scan_by_key`,
+    `thrust::exclusive_scan`, and `thrust::exclusive_scan_by_key` may fail when
+    used with large types with the CUDA Toolkit 3.1.
+
+## Thrust 1.0.0
+
+First production release of Thrust.
+
+### Breaking Changes
+
+- Rename top level namespace `komrade` to `thrust`.
+- Move `thrust::partition_copy` & `thrust::stable_partition_copy` into
+    `thrust::experimental` namespace until we can easily provide the standard
+    interface.
+- Rename `thrust::range` to `thrust::sequence` to avoid collision with
+    Boost.Range.
+- Rename `thrust::copy_if` to `thrust::copy_when` due to semantic differences
+    with C++0x `std::copy_if`.
+
+### New Features
+
+- Add C++0x style `cbegin` & `cend` methods to `thrust::host_vector` and
+    `thrust::device_vector`.
+- Add `thrust::transform_if` function.
+- Add stencil versions of `thrust::replace_if` & `thrust::replace_copy_if`.
+- Allow `counting_iterator` to work with `thrust::for_each`.
+- Allow types with constructors in comparison `thrust::sort` and
+    `thrust::reduce`.
+
+### Other Enhancements
+
+- `thrust::merge_sort` and `thrust::stable_merge_sort` are now 2x to 5x faster
+    when executed on the parallel device.
+
+### Bug Fixes
+
+- Komrade 6: Workaround an issue where an incremented iterator causes NVCC to
+    crash.
+- Komrade 7: Fix an issue where `const_iterator`s could not be passed to
+    `thrust::transform`.
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8a4eca47a..967ebf53a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,10 +1,98 @@
-cmake_minimum_required(VERSION 3.8)
-
-project(Thrust CXX)
-
-set(THRUST_SOURCE ${CMAKE_SOURCE_DIR})
-include(cmake/common_variables.cmake)
-
+# 3.15 is the minimum for including the project with add_subdirectory.
+# 3.17 for building the project's standalone tests/examples/etc.
+# 3.18.3 for C++17 + CUDA
+cmake_minimum_required(VERSION 3.15)
+
+# Remove this when we use the new CUDA_ARCHITECTURES properties with both
+# nvcc and nvc++.
+if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
+  cmake_policy(SET CMP0104 OLD)
+endif()
+
+project(Thrust NONE)
+
+# Determine whether Thrust is the top-level project or included into
+# another project via add_subdirectory()
+if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_LIST_DIR}")
+  set(THRUST_TOPLEVEL_PROJECT ON)
+else()
+  set(THRUST_TOPLEVEL_PROJECT OFF)
+endif()
+
+## thrust_fix_clang_nvcc_build_for 
+#
+# Modifies the given target to include a fix for the clang host compiler case.
+# The fix consists of force-including a header into each compilation unit.
+#
+function(thrust_fix_clang_nvcc_build_for target)
+  if (UNIX)
+    # Path to the header containing the fix for clang + nvcc < 11.6. For more info,
+    # check the content of this header.
+    set(clang_fix_header_path "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/testing/fix_clang_nvcc_11.5.h")
+
+    # Only affects host compiler
+    target_compile_options(${target} PRIVATE 
+        "$<$<COMPILE_LANGUAGE:CUDA>:-include${clang_fix_header_path}>")
+  endif()
+endfunction()
+
+# This must be done before any languages are enabled:
+if (THRUST_TOPLEVEL_PROJECT)
+  include(cmake/ThrustCompilerHacks.cmake)
+endif()
+
+# This must appear after our Compiler Hacks or else CMake will delete the cache
+# and reconfigure from scratch.
+# This must also appear before the installation rules, as it is required by the
+# GNUInstallDirs CMake module.
+enable_language(CXX)
+
+# Optionally include installation rules for non-top-level builds:
+option(THRUST_ENABLE_INSTALL_RULES "Enable installation of Thrust" ${THRUST_TOPLEVEL_PROJECT})
+if (THRUST_ENABLE_INSTALL_RULES)
+  include(cmake/ThrustInstallRules.cmake)
+endif()
+
+# Support adding Thrust to a parent project via add_subdirectory.
+# See examples/cmake/add_subdir/CMakeLists.txt for details.
+if (NOT THRUST_TOPLEVEL_PROJECT)
+  include(cmake/ThrustAddSubdir.cmake)
+  return()
+endif()
+
+# We use 3.17 features when building our tests, etc.
+cmake_minimum_required(VERSION 3.17)
+
+option(THRUST_ENABLE_HEADER_TESTING "Test that all public headers compile." "ON")
+option(THRUST_ENABLE_TESTING "Build Thrust testing suite." "ON")
+option(THRUST_ENABLE_EXAMPLES "Build Thrust examples." "ON")
+option(THRUST_ENABLE_BENCHMARKS "Build Thrust runtime benchmarks." "OFF")
+option(THRUST_INCLUDE_CUB_CMAKE "Build CUB tests and examples. (Requires CUDA)." "OFF")
+
+# Mark this option as advanced for now. We'll revisit this later once the new
+# benchmarks are ready. For now, we just need to expose a way to compile
+# bench.cu from CMake for NVIDIA's internal builds.
+mark_as_advanced(THRUST_ENABLE_BENCHMARKS)
+
+# Check if we're actually building anything before continuing. If not, no need
+# to search for deps, etc. This is a common approach for packagers that just
+# need the install rules. See GH issue NVIDIA/thrust#1211.
+if (NOT (THRUST_ENABLE_HEADER_TESTING OR
+         THRUST_ENABLE_TESTING OR
+         THRUST_ENABLE_EXAMPLES OR
+         THRUST_ENABLE_BENCHMARKS OR
+         THRUST_INCLUDE_CUB_CMAKE))
+  return()
+endif()
+
+include(cmake/AppendOptionIfAvailable.cmake)
+include(cmake/ThrustBuildCompilerTargets.cmake)
+include(cmake/ThrustBuildTargetList.cmake)
+include(cmake/ThrustFindThrust.cmake)
+include(cmake/ThrustMultiConfig.cmake)
+include(cmake/ThrustUtilities.cmake)
+
+# Add cache string options for CMAKE_BUILD_TYPE and default to RelWithDebInfo.
 if ("" STREQUAL "${CMAKE_BUILD_TYPE}")
   set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose the type of build." FORCE)
 
@@ -14,644 +102,54 @@ if ("" STREQUAL "${CMAKE_BUILD_TYPE}")
   )
 endif ()
 
-if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.12)
-  set(CMAKE_CONFIGURE_DEPENDS CONFIGURE_DEPENDS)
-endif ()
-
-list(INSERT CMAKE_MODULE_PATH 0 ${PROJECT_SOURCE_DIR}/cmake)
-include(AppendOptionIfAvailable)
-
-file(READ "thrust/version.h" THRUST_VERSION_HEADER)
-string(REGEX MATCH "THRUST_VERSION ([0-9]+)" DUMMY ${THRUST_VERSION_HEADER})
-set(THRUST_VERSION ${CMAKE_MATCH_1})
-math(EXPR THRUST_VERSION_MAJOR "(${THRUST_VERSION} / 100000)")
-math(EXPR THRUST_VERSION_MINOR "(${THRUST_VERSION} / 100) % 1000")
-math(EXPR THRUST_VERSION_PATCH "${THRUST_VERSION} % 100")
-set(
-  THRUST_VERSION_STR
-  "${THRUST_VERSION_MAJOR}.${THRUST_VERSION_MINOR}.${THRUST_VERSION_PATCH}"
-)
-message(STATUS "Thrust Version: ${THRUST_VERSION_STR}")
-
-set(THRUST_HOST_SYSTEM_OPTIONS CPP OMP TBB)
-set(THRUST_HOST_SYSTEM CPP CACHE STRING "The device backend to target.")
-set_property(
-  CACHE THRUST_HOST_SYSTEM
-  PROPERTY STRINGS ${THRUST_HOST_SYSTEM_OPTIONS}
-)
-if (NOT THRUST_HOST_SYSTEM IN_LIST THRUST_HOST_SYSTEM_OPTIONS)
-  message(
-    FATAL_ERROR
-    "THRUST_HOST_SYSTEM must be one of ${THRUST_HOST_SYSTEM_OPTIONS}"
-  )
-endif ()
-
-add_definitions(-DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_${THRUST_HOST_SYSTEM})
-
-set(THRUST_DEVICE_SYSTEM_OPTIONS CUDA CPP OMP TBB)
-set(THRUST_DEVICE_SYSTEM CUDA CACHE STRING "The device backend to target.")
-set_property(
-  CACHE THRUST_DEVICE_SYSTEM
-  PROPERTY STRINGS ${THRUST_DEVICE_SYSTEM_OPTIONS}
-)
-if (NOT THRUST_DEVICE_SYSTEM IN_LIST THRUST_DEVICE_SYSTEM_OPTIONS)
-  message(
-    FATAL_ERROR
-    "THRUST_DEVICE_SYSTEM must be one of ${THRUST_DEVICE_SYSTEM_OPTIONS}"
-  )
-endif ()
-
-add_definitions(-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_${THRUST_DEVICE_SYSTEM})
-
-# Please note this also sets the default for the CUDA C++ version; see the comment below.
-set(CMAKE_CXX_STANDARD 11 CACHE STRING "The C++ version to be used.")
+# Disable compiler extensions:
 set(CMAKE_CXX_EXTENSIONS OFF)
 
-message("-- C++ Standard version: ${CMAKE_CXX_STANDARD}")
-
-if ("CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}")
-  if (NOT "${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "")
-    unset(CMAKE_CUDA_HOST_COMPILER CACHE)
-    message(FATAL_ERROR "Thrust tests and examples require the C++ compiler"
-        " and the CUDA host compiler to be the same; to set this compiler, please"
-        " use the CMAKE_CXX_COMPILER variable, not the CMAKE_CUDA_HOST_COMPILER"
-        " variable.")
-  endif ()
-  set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
-
-  enable_language(CUDA)
-
-  # Force CUDA C++ standard to be the same as the C++ standard used.
-  #
-  # Now, CMake is unaligned with reality on standard versions: https://gitlab.kitware.com/cmake/cmake/issues/18597
-  # which means that using standard CMake methods, it's impossible to actually sync the CXX and CUDA versions for pre-11
-  # versions of C++; CUDA accepts 98 but translates that to 03, while CXX doesn't accept 03 (and doesn't translate that to 03).
-  # In case this gives You, dear user, any trouble, please escalate the above CMake bug, so we can support reality properly.
-  if (DEFINED CMAKE_CUDA_STANDARD)
-      message(WARNING "You've set CMAKE_CUDA_STANDARD; please note that this variable is ignored, and CMAKE_CXX_STANDARD"
-          " is used as the C++ standard version for both C++ and CUDA.")
-  endif()
-  unset(CMAKE_CUDA_STANDARD CACHE)
-  set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD})
-
-  set(THRUST_HIGHEST_COMPUTE_ARCH 75)
-  set(THRUST_KNOWN_COMPUTE_ARCHS 30 32 35 50 52 53 60 61 62 70 72 75)
-
-  option(THRUST_DISABLE_ARCH_BY_DEFAULT "If ON, then all CUDA architectures are disabled on the initial CMake run." OFF)
-  set(OPTION_INIT ON)
-  if (THRUST_DISABLE_ARCH_BY_DEFAULT)
-    set(OPTION_INIT OFF)
-  endif ()
-
-  if (NOT ${THRUST_HIGHEST_COMPUTE_ARCH} IN_LIST THRUST_KNOWN_COMPUTE_ARCHS)
-    message(FATAL_ERROR "When changing the highest compute version, don't forget to add it to the list!")
-  endif ()
-
-  foreach (COMPUTE_ARCH IN LISTS THRUST_KNOWN_COMPUTE_ARCHS)
-    option(THRUST_ENABLE_COMPUTE_${COMPUTE_ARCH} "Enable code generation for tests for sm_${COMPUTE_ARCH}" ${OPTION_INIT})
-    if (THRUST_ENABLE_COMPUTE_${COMPUTE_ARCH})
-      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${COMPUTE_ARCH},code=sm_${COMPUTE_ARCH}")
-      set(COMPUTE_MESSAGE "${COMPUTE_MESSAGE} sm_${COMPUTE_ARCH}")
-    endif ()
-  endforeach ()
-
-  option(THRUST_ENABLE_COMPUTE_FUTURE "Enable code generation for tests for compute_${THRUST_HIGHEST_COMPUTE_ARCH}" ${OPTION_INIT})
-  if (THRUST_ENABLE_COMPUTE_FUTURE)
-    set(CMAKE_CUDA_FLAGS
-      "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${THRUST_HIGHEST_COMPUTE_ARCH},code=compute_${THRUST_HIGHEST_COMPUTE_ARCH}")
-    set(COMPUTE_MESSAGE "${COMPUTE_MESSAGE} compute_${THRUST_HIGHEST_COMPUTE_ARCH}")
-  endif ()
-
-  message("-- Enabled CUDA architectures:${COMPUTE_MESSAGE}")
-endif ()
-
-if ("OMP" STREQUAL "${THRUST_DEVICE_SYSTEM}")
-  find_package(OpenMP REQUIRED)
-  if (OPENMP_FOUND)
-    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
-    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-    set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
-  endif()
-endif ()
-
-if ("TBB" STREQUAL "${THRUST_DEVICE_SYSTEM}")
-  find_package(PkgConfig REQUIRED)
-  pkg_check_modules(TBB tbb REQUIRED)
-  if (TBB_FOUND)
-    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${TDD_CFLAGS}")
-    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TDD_CFLAGS}")
-    set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${TBB_LD_FLAGS}")
-    set (THRUST_ADDITIONAL_LIBRARIES "${TBB_LIBRARIES}")
-  endif ()
-
-  # There's a ton of these in the TBB backend, even though the code is correct.
-  # TODO: silence these warnings in code instead
-  append_option_if_available("-Wno-unused-parameter" THRUST_CXX_WARNINGS)
-endif ()
-
-if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 1900)
-    message(FATAL_ERROR "This version of MSVC no longer supported.")
-  endif ()
-endif ()
-
-if ("GNU" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.4)
-    message(FATAL_ERROR "This version of GCC no longer supported.")
-  endif ()
-endif ()
-
-if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
-  # TODO Enable /Wall
-  append_option_if_available("/WX" THRUST_CXX_WARNINGS)
-
-  # Disabled loss-of-data conversion warnings.
-  # TODO Re-enable.
-  append_option_if_available("/wd4244" THRUST_CXX_WARNINGS)
-  append_option_if_available("/wd4267" THRUST_CXX_WARNINGS)
-
-  # Suppress numeric conversion-to-bool warnings.
-  # TODO Re-enable.
-  append_option_if_available("/wd4800" THRUST_CXX_WARNINGS)
-
-  # Disable warning about applying unary operator- to unsigned type.
-  append_option_if_available("/wd4146" THRUST_CXX_WARNINGS)
-
-  set(THRUST_TREAT_FILE_AS_CXX "/TP")
-else ()
-  append_option_if_available("-Werror" THRUST_CXX_WARNINGS)
-  append_option_if_available("-Wall" THRUST_CXX_WARNINGS)
-  append_option_if_available("-Wextra" THRUST_CXX_WARNINGS)
-  append_option_if_available("-Winit-self" THRUST_CXX_WARNINGS)
-  append_option_if_available("-Woverloaded-virtual" THRUST_CXX_WARNINGS)
-  append_option_if_available("-Wcast-qual" THRUST_CXX_WARNINGS)
-  append_option_if_available("-Wno-cast-align" THRUST_CXX_WARNINGS)
-  append_option_if_available("-Wno-long-long" THRUST_CXX_WARNINGS)
-  append_option_if_available("-Wno-variadic-macros" THRUST_CXX_WARNINGS)
-  append_option_if_available("-Wno-unused-function" THRUST_CXX_WARNINGS)
-  append_option_if_available("-Wno-unused-variable" THRUST_CXX_WARNINGS)
-
-  set(THRUST_TREAT_FILE_AS_CXX "-x c++")
-endif ()
-
-if ("GNU" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.5)
-    # In GCC 4.4, the CUDA backend's kernel launch templates cause
-    # impossible-to-decipher "'<anonymous>' is used uninitialized in this
-    # function" warnings, so we disable uninitialized variable warnings.
-    append_option_if_available("-Wno-uninitialized" THRUST_CXX_WARNINGS)
-  endif ()
-
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 4.5)
-    # This isn't available until GCC 4.3, and misfires on TMP code until
-    # GCC 4.5.
-    append_option_if_available("-Wlogical-op" THRUST_CXX_WARNINGS)
-  endif ()
-
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3)
-    # GCC 7.3 complains about name mangling changes due to `noexcept`
-    # becoming part of the type system; we don't care.
-    append_option_if_available("-Wno-noexcept-type" THRUST_CXX_WARNINGS)
-  endif ()
-
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1 AND CMAKE_CXX_STANDARD EQUAL 98)
-    # thrust::complex can't really be made trivially copyable in pre-11.
-    # Disable a warning about a non-trivially-copyable type being memmoved that was added to GCC 8.
-    append_option_if_available("-Wno-class-memaccess" THRUST_CXX_WARNINGS)
-  endif ()
-endif ()
-
-if (("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}") OR
-    ("XL" STREQUAL "${CMAKE_CXX_COMPILER_ID}"))
-  # xlC and Clang warn about unused parameters in uninstantiated templates.
-  # This causes xlC to choke on the OMP backend, which is mostly #ifdef'd out
-  # (and thus has unused parameters) when you aren't using it.
-  append_option_if_available("-Wno-unused-parameters" THRUST_CXX_WARNINGS)
-endif ()
-
-if ("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
-  # -Wunneeded-internal-declaration misfires in the unit test framework
-  # on older versions of Clang.
-  append_option_if_available("-Wno-unneeded-internal-declaration" THRUST_CXX_WARNINGS)
-endif ()
-
-foreach (CXX_OPTION IN LISTS THRUST_CXX_WARNINGS)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_OPTION}")
-endforeach ()
-
-if ("CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}")
-  foreach (CXX_OPTION IN LISTS THRUST_CXX_WARNINGS)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${CXX_OPTION}")
-  endforeach ()
-endif ()
-
-# For every public header, build a translation unit containing `#include <header>`
-# to let the compiler try to figure out warnings in that header if it is not otherwise
-# included in tests, and also to verify if the headers are modular enough.
-# .inl files are not globbed for, because they are not supposed to be used as public
-# entrypoints.
-list(APPEND THRUST_HEADER_GLOBS thrust/*.h)
-list(APPEND THRUST_HEADER_EXCLUDE_SYSTEMS_GLOBS thrust/system/*/*)
-
-string(TOLOWER ${THRUST_HOST_SYSTEM} THRUST_HOST_SYSTEM_LOWERCASE)
-list(APPEND THRUST_HEADER_SYSTEMS_GLOBS thrust/system/${THRUST_HOST_SYSTEM_LOWERCASE}/*)
-
-string(TOLOWER ${THRUST_DEVICE_SYSTEM} THRUST_DEVICE_SYSTEM_LOWERCASE)
-list(APPEND THRUST_HEADER_SYSTEMS_GLOBS thrust/system/${THRUST_DEVICE_SYSTEM_LOWERCASE}/*)
-
-list(APPEND THRUST_HEADER_EXCLUDE_DETAILS_GLOBS thrust/detail/*)
-list(APPEND THRUST_HEADER_EXCLUDE_DETAILS_GLOBS thrust/*/detail/*)
-list(APPEND THRUST_HEADER_EXCLUDE_DETAILS_GLOBS thrust/*/*/detail/*)
-
-# Get all .h files...
-file(
-  GLOB_RECURSE THRUST_HEADERS
-  RELATIVE ${PROJECT_SOURCE_DIR}/thrust
-  ${CMAKE_CONFIGURE_DEPENDS}
-  ${THRUST_HEADER_GLOBS}
-)
-
-# ...then remove all system specific headers...
-file(
-  GLOB_RECURSE THRUST_HEADER_EXCLUDE_SYSTEMS
-  RELATIVE ${PROJECT_SOURCE_DIR}/thrust
-  ${CMAKE_CONFIGURE_DEPENDS}
-  ${THRUST_HEADER_EXCLUDE_SYSTEMS_GLOBS}
-)
-list(REMOVE_ITEM THRUST_HEADERS ${THRUST_HEADER_EXCLUDE_SYSTEMS})
-
-# ...then add all headers specific to the selected host and device systems back again...
-file(
-  GLOB_RECURSE THRUST_SYSTEMS_HEADERS
-  RELATIVE ${PROJECT_SOURCE_DIR}/thrust
-  ${CMAKE_CONFIGURE_DEPENDS}
-  ${THRUST_HEADER_SYSTEMS_GLOBS}
-)
-list(APPEND THRUST_HEADERS ${THRUST_SYSTEMS_HEADERS})
-
-# ...and remove all the detail headers (also removing the detail headers from the selected systems).
-file(
-  GLOB_RECURSE THRUST_HEADER_EXCLUDE_DETAILS
-  RELATIVE ${PROJECT_SOURCE_DIR}/thrust
-  ${CMAKE_CONFIGURE_DEPENDS}
-  ${THRUST_HEADER_EXCLUDE_DETAILS_GLOBS}
-)
-list(REMOVE_ITEM THRUST_HEADERS ${THRUST_HEADER_EXCLUDE_DETAILS})
-
-# List of headers that aren't implemented for all backends, but are implemented for CUDA.
-set(THRUST_PARTIALLY_IMPLEMENTED_HEADERS_CUDA
-  async/copy.h
-  async/for_each.h
-  async/reduce.h
-  async/sort.h
-  async/transform.h
-  event.h
-  future.h
-)
-
-# List of headers that aren't implemented for all backends, but are implemented for CPP.
-set(THRUST_PARTIALLY_IMPLEMENTED_HEADERS_CPP
-)
-
-# List of headers that aren't implemented for all backends, but are implemented for TBB.
-set(THRUST_PARTIALLY_IMPLEMENTED_HEADERS_TBB
-)
-
-# List of headers that aren't implemented for all backends, but are implemented for OMP.
-set(THRUST_PARTIALLY_IMPLEMENTED_HEADERS_OMP
-)
-
-# List of all partially implemented headers.
-set(THRUST_PARTIALLY_IMPLEMENTED_HEADERS
-  emptylistguard
-  ${THRUST_PARTIALLY_IMPLEMENTED_HEADERS_CUDA}
-  ${THRUST_PARTIALLY_IMPLEMENTED_HEADERS_CPP}
-  ${THRUST_PARTIALLY_IMPLEMENTED_HEADERS_TBB}
-  ${THRUST_PARTIALLY_IMPLEMENTED_HEADERS_OMP}
-)
-
-list(REMOVE_DUPLICATES THRUST_PARTIALLY_IMPLEMENTED_HEADERS)
-
-foreach (THRUST_HEADER IN LISTS THRUST_HEADERS)
-  if ("${THRUST_HEADER}" IN_LIST THRUST_PARTIALLY_IMPLEMENTED_HEADERS)
-    # This header is partially implemented on _some_ backends...
-    if (NOT "${THRUST_HEADER}" IN_LIST THRUST_PARTIALLY_IMPLEMENTED_HEADERS_${THRUST_DEVICE_SYSTEM})
-      # ...but not on the selected one.
-      continue()
-    endif ()
-  endif ()
-
-  set(THRUST_HEADER_TEST_EXT .cpp)
-  if ("CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}")
-    set(THRUST_HEADER_TEST_EXT .cu)
-  endif ()
-
-  set(SOURCE_NAME headers/${THRUST_HEADER}${THRUST_HEADER_TEST_EXT})
-  configure_file(cmake/header_test.in ${SOURCE_NAME})
-
-  list(APPEND THRUST_HEADER_TEST_SOURCES ${SOURCE_NAME})
-endforeach ()
-
-add_library(header-test OBJECT ${THRUST_HEADER_TEST_SOURCES})
-target_include_directories(
-  header-test
-  PUBLIC ${PROJECT_SOURCE_DIR}
-)
-
-include(CTest)
-enable_testing()
-
-# Handle tests.
-
-option(THRUST_ENABLE_TESTS_WITH_RDC "Also build all tests with RDC." OFF)
-
-set(THRUST_TEST_RUN_ARGUMENTS
-  -DTHRUST_SOURCE=${CMAKE_SOURCE_DIR}
-  -P "${CMAKE_SOURCE_DIR}/cmake/run_test.cmake")
-
-list(APPEND THRUST_TESTFRAMEWORK_FILES testing/unittest/testframework.cu)
-if ("CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}")
-  list(APPEND THRUST_TESTFRAMEWORK_FILES testing/unittest/cuda/testframework.cu)
-else ()
-  # When CUDA is disabled, explain to CMake that testframework.cu is actually a C++ file.
-  set_source_files_properties(testing/unittest/testframework.cu
-    PROPERTIES
-      LANGUAGE CXX
-      COMPILE_FLAGS "${THRUST_TREAT_FILE_AS_CXX}")
-endif ()
-
-add_library(thrust_testframework STATIC ${THRUST_TESTFRAMEWORK_FILES})
-target_include_directories(
-  thrust_testframework
-  PUBLIC ${PROJECT_SOURCE_DIR}
-  PRIVATE ${PROJECT_SOURCE_DIR}/testing
-)
-
-list(APPEND THRUST_TEST_GLOBS testing/*.cu)
-list(APPEND THRUST_TEST_GLOBS testing/*.cpp)
-
-if     ("CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}")
-  list(APPEND THRUST_TEST_GLOBS testing/cuda/*.cu)
-elseif ("OMP" STREQUAL "${THRUST_DEVICE_SYSTEM}")
-  list(APPEND THRUST_TEST_GLOBS testing/omp/*.cu)
-  list(APPEND THRUST_TEST_GLOBS testing/omp/*.cpp)
-endif ()
-
-file(
-  GLOB THRUST_TESTS
-  RELATIVE ${PROJECT_SOURCE_DIR}/testing
-  ${CMAKE_CONFIGURE_DEPENDS}
-  ${THRUST_TEST_GLOBS}
-)
-
-# List of tests that aren't implemented for all backends, but are implemented for CUDA.
-set(THRUST_PARTIALLY_IMPLEMENTED_CUDA
-    async_copy
-    async_for_each
-    async_reduce
-    async_reduce_into
-    async_sort
-    async_transform
-    event
-    future
-)
-
-# List of tests that aren't implemented for all backends, but are implemented for CPP.
-set(THRUST_PARTIALLY_IMPLEMENTED_CPP
-)
-
-# List of tests that aren't implemented for all backends, but are implemented for TBB.
-set(THRUST_PARTIALLY_IMPLEMENTED_TBB
-)
-
-# List of tests that aren't implemented for all backends, but are implemented for OMP.
-set(THRUST_PARTIALLY_IMPLEMENTED_OMP
-)
-
-# List of all partially implemented tests.
-set(THRUST_PARTIALLY_IMPLEMENTED
-  ${THRUST_PARTIALLY_IMPLEMENTED_CUDA}
-  ${THRUST_PARTIALLY_IMPLEMENTED_CPP}
-  ${THRUST_PARTIALLY_IMPLEMENTED_TBB}
-  ${THRUST_PARTIALLY_IMPLEMENTED_OMP}
-)
-
-if ("CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}")
-  if (14 EQUAL ${CMAKE_CXX_STANDARD})
-    # Temporarily disable until NVBug 2492786 is fixed.
-    list(APPEND THRUST_PARTIALLY_IMPLEMENTED tuple_algorithms)
-  endif()
-endif ()
-
-list(REMOVE_DUPLICATES THRUST_PARTIALLY_IMPLEMENTED)
-
-foreach (THRUST_TEST_SOURCE IN LISTS THRUST_TESTS)
-  # TODO: Per-test flags.
-
-  set(THRUST_TEST_CREATION_ADDITIONAL)
-  set(THRUST_TEST_ADD_TO_CTEST ON)
-
-  get_filename_component(THRUST_TEST_CATEGORY ${THRUST_TEST_SOURCE} DIRECTORY)
-  if (NOT ("" STREQUAL "${THRUST_TEST_CATEGORY}"))
-    set(THRUST_TEST_CATEGORY "${THRUST_TEST_CATEGORY}.")
-  endif ()
-
-  get_filename_component(THRUST_TEST_NAME ${THRUST_TEST_SOURCE} NAME_WE)
-
-  if ("${THRUST_TEST_NAME}" IN_LIST THRUST_PARTIALLY_IMPLEMENTED)
-    # This test is partially implemented on _some_ backends...
-    if (NOT "${THRUST_TEST_NAME}" IN_LIST THRUST_PARTIALLY_IMPLEMENTED_${THRUST_DEVICE_SYSTEM})
-      # ...but not on the selected one.
-      set(THRUST_TEST_CREATION_ADDITIONAL EXCLUDE_FROM_ALL)
-      set(THRUST_TEST_ADD_TO_CTEST OFF)
-    endif ()
-  endif ()
-
-  set(THRUST_TEST "thrust.test.${THRUST_TEST_CATEGORY}${THRUST_TEST_NAME}")
-
-  if (NOT "CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}")
-    # Test files are generally .cu; if CUDA is not enabled, CMake doesn't know what to
-    # do with them. But since they are pretty much just C++, we can compile them with
-    # non-nvcc C++ compilers... but we need to tell CMake that they are, in fact, just C++.
-    set_source_files_properties(${PROJECT_SOURCE_DIR}/testing/${THRUST_TEST_SOURCE}
-      PROPERTIES
-        LANGUAGE CXX
-        COMPILE_FLAGS "${THRUST_TREAT_FILE_AS_CXX}")
-  endif ()
-
-  add_executable(
-    ${THRUST_TEST}
-    ${THRUST_TEST_CREATION_ADDITIONAL}
-    # THRUST_TEST_CREATION_ADDITIONAL is actually a CMake keyword (sometimes).
-    ${PROJECT_SOURCE_DIR}/testing/${THRUST_TEST_SOURCE}
-  )
-
-  target_include_directories(
-    ${THRUST_TEST}
-    PUBLIC ${PROJECT_SOURCE_DIR}
-    PRIVATE ${PROJECT_SOURCE_DIR}/testing
-  )
-
-  target_link_libraries(${THRUST_TEST}
-    thrust_testframework
-    ${THRUST_ADDITIONAL_LIBRARIES})
-
-  if (THRUST_TEST_ADD_TO_CTEST)
-    add_test(NAME ${THRUST_TEST}
-      COMMAND ${CMAKE_COMMAND}
-        -DTHRUST_BINARY=$<TARGET_FILE:${THRUST_TEST}>
-        ${THRUST_TEST_RUN_ARGUMENTS})
-  endif ()
-
-  if ("CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}" AND THRUST_ENABLE_TESTS_WITH_RDC)
-    set(THRUST_TEST_RDC "thrust.test.${THRUST_TEST_CATEGORY}rdc.${THRUST_TEST_NAME}")
-
-    add_executable(
-      ${THRUST_TEST_RDC}
-      ${THRUST_TEST_CREATION_ADDITIONAL}
-      # THRUST_TEST_CREATION_ADDITIONAL is actually a CMake keyword (sometimes).
-      ${PROJECT_SOURCE_DIR}/testing/${THRUST_TEST_SOURCE}
-    )
-
-    target_include_directories(
-      ${THRUST_TEST_RDC}
-      PUBLIC ${PROJECT_SOURCE_DIR}
-      PRIVATE ${PROJECT_SOURCE_DIR}/testing
-    )
-
-    target_link_libraries(${THRUST_TEST_RDC}
-      thrust_testframework
-      ${THRUST_ADDITIONAL_LIBRARIES})
-
-    set_target_properties(${THRUST_TEST_RDC}
-      PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
-
-    if (THRUST_TEST_ADD_TO_CTEST)
-      add_test(NAME ${THRUST_TEST_RDC}
-        COMMAND ${CMAKE_COMMAND}
-          -DTHRUST_BINARY=$<TARGET_FILE:${THRUST_TEST_RDC}>
-          ${THRUST_TEST_RUN_ARGUMENTS})
-    endif ()
-  endif ()
-endforeach ()
-
-# Handle examples.
-
-option(THRUST_EXAMPLE_FILECHECK_PATH "Path to the LLVM FileCheck utility." "")
-option(THRUST_ENABLE_EXAMPLES_WITH_RDC "Also build all examples with RDC." OFF)
-
-set(THRUST_EXAMPLE_FILECHECK_ENABLED OFF)
-if (NOT "" STREQUAL "${THRUST_EXAMPLE_FILECHECK_PATH}")
-  execute_process(
-    COMMAND "${THRUST_EXAMPLE_FILECHECK_PATH}" "${THRUST_FILECHECK_DATA_PATH}/thrust.sanity.filecheck"
-    INPUT_FILE "${CMAKE_SOURCE_DIR}/cmake/sanity"
-    RESULT_VARIABLE THRUST_FILECHECK_RESULT
-  )
-
-  if ("0" STREQUAL "${THRUST_FILECHECK_RESULT}")
-    set(THRUST_EXAMPLE_FILECHECK_ENABLED ON)
-    message("-- FileCheck enabled: ${THRUST_EXAMPLE_FILECHECK_PATH}")
-  endif ()
-endif ()
-
-list(APPEND THRUST_EXAMPLE_GLOBS examples/*.cu)
-list(APPEND THRUST_EXAMPLE_GLOBS examples/*.cpp)
-
-if     ("CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}")
-  list(APPEND THRUST_EXAMPLE_GLOBS examples/cuda/*.cu)
-elseif ("OMP" STREQUAL "${THRUST_DEVICE_SYSTEM}")
-  list(APPEND THRUST_EXAMPLE_GLOBS examples/omp/*.cu)
-  list(APPEND THRUST_EXAMPLE_GLOBS examples/omp/*.cpp)
-endif ()
-
-if (CMAKE_VERSION VERSION_LESS 3.12)
-  file(
-    GLOB THRUST_EXAMPLES
-    RELATIVE ${PROJECT_SOURCE_DIR}/examples
-    ${THRUST_EXAMPLE_GLOBS}
-    CONFIGURE_DEPENDS
-  )
-else ()
-  file(
-    GLOB THRUST_EXAMPLES
-    RELATIVE ${PROJECT_SOURCE_DIR}/examples
-    ${THRUST_EXAMPLE_GLOBS}
-  )
-endif ()
-
-set(THRUST_EXAMPLE_RUN_ARGUMENTS
-  -DTHRUST_SOURCE=${CMAKE_SOURCE_DIR}
-  -DTHRUST_FILECHECK_ENABLED=${THRUST_EXAMPLE_FILECHECK_ENABLED}
-  -DTHRUST_FILECHECK=${THRUST_EXAMPLE_FILECHECK_PATH}
-  -P "${CMAKE_SOURCE_DIR}/cmake/run_example.cmake")
-
-foreach (THRUST_EXAMPLE_SOURCE IN LISTS THRUST_EXAMPLES)
-  # TODO: Per-example flags.
-
-  get_filename_component(THRUST_EXAMPLE_CATEGORY ${THRUST_EXAMPLE_SOURCE} DIRECTORY)
-  if (NOT ("" STREQUAL "${THRUST_EXAMPLE_CATEGORY}"))
-    set(THRUST_EXAMPLE_CATEGORY "${THRUST_EXAMPLE_CATEGORY}.")
-  endif ()
-
-  get_filename_component(THRUST_EXAMPLE_NAME ${THRUST_EXAMPLE_SOURCE} NAME_WE)
-
-  set(THRUST_EXAMPLE "thrust.example.${THRUST_EXAMPLE_CATEGORY}${THRUST_EXAMPLE_NAME}")
-
-  if (NOT "CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}")
-    # Example files are generally .cu; if CUDA is not enabled, CMake doesn't know what to
-    # do with them. But since they are pretty much just C++, we can compile them with
-    # non-nvcc C++ compilers... but we need to tell CMake that they are, in fact, just C++.
-    set_source_files_properties(${PROJECT_SOURCE_DIR}/examples/${THRUST_EXAMPLE_SOURCE}
-      PROPERTIES
-        LANGUAGE CXX
-        COMPILE_FLAGS "${THRUST_TREAT_FILE_AS_CXX}")
-  endif ()
-
-  add_executable(
-    ${THRUST_EXAMPLE}
-    ${PROJECT_SOURCE_DIR}/examples/${THRUST_EXAMPLE_SOURCE}
-  )
-
-  target_include_directories(
-    ${THRUST_EXAMPLE}
-    PUBLIC ${PROJECT_SOURCE_DIR}
-    PRIVATE ${PROJECT_SOURCE_DIR}/examples
-  )
-
-  target_link_libraries(${THRUST_EXAMPLE}
-    ${THRUST_ADDITIONAL_LIBRARIES})
-
-  add_test(NAME ${THRUST_EXAMPLE}
-    COMMAND ${CMAKE_COMMAND}
-      -DTHRUST_EXAMPLE=${THRUST_EXAMPLE}
-      -DTHRUST_BINARY=$<TARGET_FILE:${THRUST_EXAMPLE}>
-      ${THRUST_EXAMPLE_RUN_ARGUMENTS})
-
-  if ("CUDA" STREQUAL "${THRUST_DEVICE_SYSTEM}" AND THRUST_ENABLE_EXAMPLES_WITH_RDC)
-    set(THRUST_EXAMPLE_RDC "thrust.example.${THRUST_EXAMPLE_CATEGORY}rdc.${THRUST_EXAMPLE_NAME}")
-
-    add_executable(
-      ${THRUST_EXAMPLE_RDC}
-      ${PROJECT_SOURCE_DIR}/examples/${THRUST_EXAMPLE_SOURCE}
-    )
-
-    target_include_directories(
-      ${THRUST_EXAMPLE_RDC}
-      PUBLIC ${PROJECT_SOURCE_DIR}
-      PRIVATE ${PROJECT_SOURCE_DIR}/examples
-    )
-
-    target_link_libraries(${THRUST_EXAMPLE_RDC}
-      ${THRUST_ADDITIONAL_LIBRARIES})
-
-    set_target_properties(${THRUST_EXAMPLE_RDC}
-      PROPERTIES CUDA_SEPERABLE_COMPILATION ON)
-
-    add_test(NAME ${THRUST_EXAMPLE_RDC}
-      COMMAND ${CMAKE_COMMAND}
-        -DTHRUST_EXAMPLE=${THRUST_EXAMPLE}
-        -DTHRUST_BINARY=$<TARGET_FILE:${THRUST_EXAMPLE_RDC}>
-        ${THRUST_EXAMPLE_RUN_ARGUMENTS})
-  endif ()
-endforeach ()
-
+# Where to put build outputs. Use CMAKE_BINARY_DIR so they'll show up in the
+# top-level project's dir when building Thrust via add_subdirectory.
+set(THRUST_LIBRARY_OUTPUT_DIR "${CMAKE_BINARY_DIR}/lib")
+set(THRUST_EXECUTABLE_OUTPUT_DIR "${CMAKE_BINARY_DIR}/bin")
+
+thrust_configure_multiconfig()
+thrust_find_thrust()
+thrust_build_compiler_targets()
+thrust_update_system_found_flags()
+if (THRUST_CUDA_FOUND)
+  include(cmake/ThrustCudaConfig.cmake)
+endif()
+thrust_build_target_list()
+
+message(STATUS "CPP system found?  ${THRUST_CPP_FOUND}")
+message(STATUS "CUDA system found? ${THRUST_CUDA_FOUND}")
+message(STATUS "TBB system found?  ${THRUST_TBB_FOUND}")
+message(STATUS "OMP system found?  ${THRUST_OMP_FOUND}")
+
+if (THRUST_ENABLE_HEADER_TESTING)
+  include(cmake/ThrustHeaderTesting.cmake)
+endif()
+
+# Both testing and examples use ctest
+if (THRUST_ENABLE_TESTING OR THRUST_ENABLE_EXAMPLES)
+  include(CTest)
+  enable_testing()
+endif()
+
+if (THRUST_ENABLE_TESTING)
+  add_subdirectory(testing)
+endif()
+
+if (THRUST_ENABLE_EXAMPLES)
+  add_subdirectory(examples)
+endif()
+
+if (THRUST_ENABLE_BENCHMARKS)
+  add_subdirectory(internal/benchmark)
+endif()
+
+if (THRUST_INCLUDE_CUB_CMAKE AND THRUST_CUDA_FOUND)
+  set(CUB_IN_THRUST ON)
+  # CUB's path is specified generically to support both GitHub and Perforce
+  # source tree layouts. The include directory used by cub-config.cmake
+  # for source layouts is the same as the project root.
+  add_subdirectory("${_CUB_INCLUDE_DIR}" dependencies/cub)
+endif()
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 000000000..8c56af363
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,96 @@
+# Code of Conduct
+
+## Overview
+
+This document defines the Code of Conduct followed and enforced for NVIDIA C++
+  Core Compute Libraries.
+
+### Intended Audience
+
+* Community
+* Developers
+* Project Leads
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+  contributors and maintainers pledge to making participation in our project and
+  our community a harassment-free experience for everyone, regardless of age,
+  body size, disability, ethnicity, sex characteristics, gender identity and
+  expression, level of experience, education, socio-economic status, nationality,
+  personal appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment include:
+
+- Using welcoming and inclusive language.
+- Being respectful of differing viewpoints and experiences.
+- Gracefully accepting constructive criticism.
+- Focusing on what is best for the community.
+- Showing empathy towards other community members.
+
+Examples of unacceptable behavior by participants include:
+
+- The use of sexualized language or imagery and unwelcome sexual attention or
+    advances.
+- Trolling, insulting/derogatory comments, and personal or political attacks.
+- Public or private harassment.
+- Publishing others’ private information, such as a physical or electronic
+    address, without explicit permission.
+- Other conduct which could reasonably be considered inappropriate.
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+  behavior and are expected to take appropriate and fair corrective action in
+  response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+  reject comments, commits, code, wiki edits, issues, and other contributions
+  that are not aligned to this Code of Conduct, or to ban temporarily or
+  permanently any contributor for other behaviors that they deem inappropriate,
+  threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies both within project spaces and in public spaces
+  when an individual is representing the project or its community.
+Examples of representing a project or community include using an official
+  project email address, posting via an official social media account, or acting
+  as an appointed representative at an online or offline event.
+Representation of a project may be further defined and clarified by project
+  maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+  reported by contacting [cpp-conduct@nvidia.com](mailto:cpp-conduct@nvidia.com).
+All complaints will be reviewed and investigated and will result in a response
+  that is deemed necessary and appropriate to the circumstances.
+The project team is obligated to maintain confidentiality with regard to the
+  reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+  faith may face temporary or permanent repercussions as determined by other
+  members of the project’s leadership.
+
+## Attribution
+
+This Code of Conduct was taken from the [NVIDIA RAPIDS] project, which was
+  adapted from the [Contributor Covenant version 1.4].
+
+Please see this [FAQ] for answers to common questions about this Code of Conduct.
+
+## Contact
+
+Please email [cpp-conduct@nvidia.com] for any Code of Conduct related matters.
+
+
+[cpp-conduct@nvidia.com]: mailto:cpp-conduct@nvidia.com
+
+[FAQ]: https://www.contributor-covenant.org/faq
+
+[NVIDIA RAPIDS]: https://docs.rapids.ai/resources/conduct/
+[Contributor Covenant version 1.4]: https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
diff --git a/LICENSE b/LICENSE
index e454a5258..c22c22563 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,3 +1,7 @@
+Unless otherwise noted, Thrust's source code is released under the Apache
+License, Version 2.0:
+
+================================================================================
 
                                  Apache License
                            Version 2.0, January 2004
@@ -174,5 +178,72 @@
       incurred by, or claims asserted against, such Contributor by reason
       of your accepting any such warranty or additional liability.
 
-   END OF TERMS AND CONDITIONS
-
+================================================================================
+
+Some portions of Thrust may be licensed under other compatible open-source
+licenses. Any divergence from the Apache 2 license will be noted in the source
+code where applicable.
+
+Portions under other terms include, but are not limited to:
+
+================================================================================
+
+Various C++ utility classes in Thrust are based on the Boost Iterator, Tuple,
+System, and Random Number libraries, which are provided under the Boost Software
+License:
+
+    Boost Software License - Version 1.0 - August 17th, 2003
+
+    Permission is hereby granted, free of charge, to any person or organization
+    obtaining a copy of the software and accompanying documentation covered by
+    this license (the "Software") to use, reproduce, display, distribute,
+    execute, and transmit the Software, and to prepare derivative works of the
+    Software, and to permit third-parties to whom the Software is furnished to
+    do so, all subject to the following:
+
+    The copyright notices in the Software and this entire statement, including
+    the above license grant, this restriction and the following disclaimer,
+    must be included in all copies of the Software, in whole or in part, and
+    all derivative works of the Software, unless such copies or derivative
+    works are solely in the form of machine-executable object code generated by
+    a source language processor.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+    SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+    FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+    ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+    DEALINGS IN THE SOFTWARE.
+
+================================================================================
+
+Portions of the thrust::complex implementation are derived from FreeBSD with the
+following terms:
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions
+    are met:
+
+    1. Redistributions of source code must retain the above copyright
+       notice[1] unmodified, this list of conditions, and the following
+       disclaimer.
+    2. Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+
+    THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+    IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+    OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+    IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+    NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+    THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+[1] Individual copyright notices from the original authors are included in
+    the relevant source files.
+
+================================================================================
diff --git a/Makefile b/Makefile
index 12f9d964c..4b5a4a423 100644
--- a/Makefile
+++ b/Makefile
@@ -1,39 +1,25 @@
-# Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+# Copyright 2010-2020 NVIDIA Corporation.
 #
-# NOTICE TO USER:
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-# This source code is subject to NVIDIA ownership rights under U.S. and
-# international Copyright laws.
+#		http://www.apache.org/licenses/LICENSE-2.0
 #
-# This software and the information contained herein is being provided
-# under the terms and conditions of a Source Code License Agreement.
-#
-# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
-# CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
-# IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
-# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
-# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
-# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-# OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
-# OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
-# OR PERFORMANCE OF THIS SOURCE CODE.
-#
-# U.S. Government End Users.   This source code is a "commercial item" as
-# that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
-# "commercial computer  software"  and "commercial computer software
-# documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
-# and is provided to the U.S. Government only as a commercial end item.
-# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
-# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
-# source code with only those rights set forth herein.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 # Makefile for building Thrust unit test driver
 
 # Force C++11 mode. NVCC will ignore it if the host compiler doesn't support it.
-#export CXX_STD = c++11
+export CXX_STD := c++11
 
-export VERBOSE = 1
+export CCCL_ENABLE_DEPRECATIONS := 1
+
+export VERBOSE := 1
 
 ifndef PROFILE
   ifdef VULCAN_TOOLKIT_BASE
@@ -53,10 +39,6 @@ else
   include ../build/config/DetectOS.mk
 endif
 
-ifeq ($(OS),win32)
-  export I_AM_SLOPPY := 1
-endif
-
 TMP_DIR      := built
 TMP_PREFIX   := $(ROOTDIR)
 TMP_ARCH     := $(ARCH)_$(PROFILE)_agnostic
@@ -129,50 +111,20 @@ else
   include ../build/common.mk
 endif
 
-# Print host compiler version.
-
-VERSION_FLAG :=
-ifeq ($(OS),$(filter $(OS),Linux Darwin))
-  ifdef USEPGCXX        # PGI
-    VERSION_FLAG := -V
-  else
-    ifdef USEXLC        # XLC
-      VERSION_FLAG := -qversion
-    else                # GCC, ICC or Clang AKA the sane ones.
-      VERSION_FLAG := --version
-    endif
-  endif
-else ifeq ($(OS),win32) # MSVC
-  # cl.exe run without any options will print its version info and exit.
-  VERSION_FLAG :=
-endif
-
-CCBIN_ENVIRONMENT :=
-ifeq ($(OS), QNX)
-  # QNX's GCC complains if QNX_HOST and QNX_TARGET aren't defined in the
-  # environment.
-  CCBIN_ENVIRONMENT := QNX_HOST=$(QNX_HOST) QNX_TARGET=$(QNX_TARGET)
-endif
-
-$(info #### CCBIN         : $(CCBIN))
-$(info #### CCBIN VERSION : $(shell $(CCBIN_ENVIRONMENT) $(CCBIN) $(VERSION_FLAG)))
-$(info #### CXX_STD       : $(CXX_STD))
-
 ifeq ($(OS), win32)
-  CREATE_DVS_PACKAGE = $(ZIP) -r built/CUDA-thrust-package.zip bin thrust/internal/test thrust/internal/scripts thrust/internal/benchmark thrust/*.trs $(DVS_COMMON_TEST_PACKAGE_FILES)
-  APPEND_HEADERS_DVS_PACKAGE = $(ZIP) -rg built/CUDA-thrust-package.zip thrust -9 -i *.h
+  CREATE_DVS_PACKAGE = $(ZIP) -r built/CUDA-thrust-package.zip bin thrust/internal/test thrust/internal/scripts thrust/internal/benchmark $(DVS_COMMON_TEST_PACKAGE_FILES)
+  APPEND_H_DVS_PACKAGE = $(ZIP) -rg built/CUDA-thrust-package.zip thrust -9 -i *.h
   APPEND_INL_DVS_PACKAGE = $(ZIP) -rg built/CUDA-thrust-package.zip thrust -9 -i *.inl
   APPEND_CUH_DVS_PACKAGE = $(ZIP) -rg built/CUDA-thrust-package.zip thrust -9 -i *.cuh
-  MAKE_DVS_PACKAGE = $(CREATE_DVS_PACKAGE) && $(APPEND_HEADERS_DVS_PACKAGE) && $(APPEND_INL_DVS_PACKAGE) && $(APPEND_CUH_DVS_PACKAGE)
+  MAKE_DVS_PACKAGE = $(CREATE_DVS_PACKAGE) && $(APPEND_H_DVS_PACKAGE) && $(APPEND_INL_DVS_PACKAGE) && $(APPEND_CUH_DVS_PACKAGE)
 else
-  CREATE_DVS_PACKAGE = tar -cv -f built/CUDA-thrust-package.tar bin thrust/internal/test thrust/internal/scripts thrust/internal/benchmark thrust/*.trs $(DVS_COMMON_TEST_PACKAGE_FILES)
-  APPEND_HEADERS_DVS_PACKAGE = find thrust -name "*.h" | xargs tar rvf built/CUDA-thrust-package.tar
-  APPEND_INL_DVS_PACKAGE = find thrust -name "*.inl" | xargs tar rvf built/CUDA-thrust-package.tar
-  APPEND_CUH_DVS_PACKAGE = find thrust -name "*.cuh" | xargs tar rvf built/CUDA-thrust-package.tar
-  COMPRESS_DVS_PACKAGE = bzip2 built/CUDA-thrust-package.tar
-  MAKE_DVS_PACKAGE = $(CREATE_DVS_PACKAGE) && $(APPEND_HEADERS_DVS_PACKAGE) && $(APPEND_INL_DVS_PACKAGE) && $(APPEND_CUH_DVS_PACKAGE) && $(COMPRESS_DVS_PACKAGE)
+  TAR_FILES = bin thrust/internal/test thrust/internal/scripts thrust/internal/benchmark $(DVS_COMMON_TEST_PACKAGE_FILES)
+  TAR_FILES += `find -L thrust \( -name "*.cuh" -o -name "*.h" -o -name "*.inl" \)`
+  MAKE_DVS_PACKAGE = tar -I bzip2 -chvf built/CUDA-thrust-package.tar.bz2 $(TAR_FILES)
 endif
 
+COPY_CUB_FOR_PACKAGING = rm -rf cub && cp -rp ../cub/cub cub
+
 DVS_OPTIONS :=
 
 ifneq ($(TARGET_ARCH),$(HOST_ARCH))
@@ -185,16 +137,20 @@ endif
 THRUST_DVS_BUILD = release
 
 pack:
+	$(COPY_CUB_FOR_PACKAGING)
 	cd .. && $(MAKE_DVS_PACKAGE)
 
 dvs:
+	$(COPY_CUB_FOR_PACKAGING)
+# Build the CUDA Runtime in GVS, because GVS has no CUDA Runtime component.
+# This is a temporary workaround until the Tegra team adds a CUDA Runtime
+# component, which they have promised to do.
+ifdef GVS
 	$(MAKE) $(DVS_OPTIONS) -s -C ../cuda $(THRUST_DVS_BUILD)
+endif
 	$(MAKE) $(DVS_OPTIONS) $(THRUST_DVS_BUILD) THRUST_DVS=1
 	cd .. && $(MAKE_DVS_PACKAGE)
 
-# XXX Deprecated, remove.
-dvs_nightly: dvs
-
 dvs_release:
 	$(MAKE) dvs THRUST_DVS_BUILD=release
 
diff --git a/NOTICE b/NOTICE
deleted file mode 100644
index 1ce1dcc29..000000000
--- a/NOTICE
+++ /dev/null
@@ -1,26 +0,0 @@
-Thrust includes source code from the Boost Iterator, Tuple, System, and Random Number libraries.
-
-    Boost Software License - Version 1.0 - August 17th, 2003
-    
-    Permission is hereby granted, free of charge, to any person or organization
-    obtaining a copy of the software and accompanying documentation covered by
-    this license (the "Software") to use, reproduce, display, distribute,
-    execute, and transmit the Software, and to prepare derivative works of the
-    Software, and to permit third-parties to whom the Software is furnished to
-    do so, all subject to the following:
-    
-    The copyright notices in the Software and this entire statement, including
-    the above license grant, this restriction and the following disclaimer,
-    must be included in all copies of the Software, in whole or in part, and
-    all derivative works of the Software, unless such copies or derivative
-    works are solely in the form of machine-executable object code generated by
-    a source language processor.
-    
-    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-    FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-    SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-    FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-    ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-    DEALINGS IN THE SOFTWARE.
-
diff --git a/README.md b/README.md
index 37c26ba90..b885389d4 100644
--- a/README.md
+++ b/README.md
@@ -1,78 +1,253 @@
-Thrust: Code at the speed of light
-==================================
+:warning: **The Thrust repository has been archived and is now part of the unified [nvidia/cccl repository](https://github.com/nvidia/cccl). See the [announcement here](https://github.com/NVIDIA/cccl/discussions/520) for more information. Please visit the new repository for the latest updates.** :warning:
 
-Thrust is a C++ parallel programming library which resembles the C++ Standard
-Library. Thrust's **high-level** interface greatly enhances
-programmer **productivity** while enabling performance portability between
-GPUs and multicore CPUs. **Interoperability** with established technologies
-(such as CUDA, TBB, and OpenMP) facilitates integration with existing
-software. Develop **high-performance** applications rapidly with Thrust!
+# Thrust: The C++ Parallel Algorithms Library
 
-Thrust is distributed with the CUDA Toolkit in addition to GitHub.
+<table><tr>
+<th><b><a href="https://github.com/nvidia/thrust/tree/main/examples">Examples</a></b></th>
+<th><b><a href="https://godbolt.org/z/8E8W764E6">Godbolt</a></b></th>
+<th><b><a href="https://nvidia.github.io/thrust">Documentation</a></b></th>
+</tr></table>
 
-Examples
---------
+Thrust is the C++ parallel algorithms library which inspired the introduction
+  of parallel algorithms to the C++ Standard Library.
+Thrust's **high-level** interface greatly enhances programmer **productivity**
+  while enabling performance portability between GPUs and multicore CPUs.
+It builds on top of established parallel programming frameworks (such as CUDA,
+  TBB, and OpenMP).
+It also provides a number of general-purpose facilities similar to those found
+  in the C++ Standard Library.
 
-Thrust is best explained through examples. The following source code
-generates random numbers serially and then transfers them to a parallel
-device where they are sorted.
+The NVIDIA C++ Standard Library is an open source project; it is available on
+  [GitHub] and included in the NVIDIA HPC SDK and CUDA Toolkit.
+If you have one of those SDKs installed, no additional installation or compiler
+  flags are needed to use libcu++.
 
-```c++
+## Examples
+
+Thrust is best learned through examples.
+
+The following example generates random numbers serially and then transfers them
+  to a parallel device where they are sorted.
+
+```cuda
 #include <thrust/host_vector.h>
 #include <thrust/device_vector.h>
 #include <thrust/generate.h>
 #include <thrust/sort.h>
 #include <thrust/copy.h>
-#include <algorithm>
-#include <cstdlib>
+#include <thrust/random.h>
 
-int main(void)
-{
-  // generate 32M random numbers serially
+int main() {
+  // Generate 32M random numbers serially.
+  thrust::default_random_engine rng(1337);
+  thrust::uniform_int_distribution<int> dist;
   thrust::host_vector<int> h_vec(32 << 20);
-  std::generate(h_vec.begin(), h_vec.end(), rand);
+  thrust::generate(h_vec.begin(), h_vec.end(), [&] { return dist(rng); });
 
-  // transfer data to the device
+  // Transfer data to the device.
   thrust::device_vector<int> d_vec = h_vec;
 
-  // sort data on the device (846M keys per second on GeForce GTX 480)
+  // Sort data on the device.
   thrust::sort(d_vec.begin(), d_vec.end());
 
-  // transfer data back to host
+  // Transfer data back to host.
   thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
-
-  return 0;
 }
 ```
-  
-This code sample computes the sum of 100 random numbers in parallel:
 
-```c++
+[See it on Godbolt](https://godbolt.org/z/GeWEd8Er9)
+
+This example demonstrates computing the sum of some random numbers in parallel:
+
+```cuda
 #include <thrust/host_vector.h>
 #include <thrust/device_vector.h>
 #include <thrust/generate.h>
 #include <thrust/reduce.h>
 #include <thrust/functional.h>
-#include <algorithm>
-#include <cstdlib>
+#include <thrust/random.h>
+
+int main() {
+  // Generate random data serially.
+  thrust::default_random_engine rng(1337);
+  thrust::uniform_real_distribution<double> dist(-50.0, 50.0);
+  thrust::host_vector<double> h_vec(32 << 20);
+  thrust::generate(h_vec.begin(), h_vec.end(), [&] { return dist(rng); });
+
+  // Transfer to device and compute the sum.
+  thrust::device_vector<double> d_vec = h_vec;
+  double x = thrust::reduce(d_vec.begin(), d_vec.end(), 0, thrust::plus<int>());
+}
+```
 
-int main(void)
-{
-  // generate random data serially
-  thrust::host_vector<int> h_vec(100);
-  std::generate(h_vec.begin(), h_vec.end(), rand);
+[See it on Godbolt](https://godbolt.org/z/cnsbWWME7)
 
-  // transfer to device and compute sum
-  thrust::device_vector<int> d_vec = h_vec;
-  int x = thrust::reduce(d_vec.begin(), d_vec.end(), 0, thrust::plus<int>());
-  return 0;
+This example show how to perform such a reduction asynchronously:
+
+```cuda
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/async/copy.h>
+#include <thrust/async/reduce.h>
+#include <thrust/functional.h>
+#include <thrust/random.h>
+#include <numeric>
+
+int main() {
+  // Generate 32M random numbers serially.
+  thrust::default_random_engine rng(123456);
+  thrust::uniform_real_distribution<double> dist(-50.0, 50.0);
+  thrust::host_vector<double> h_vec(32 << 20);
+  thrust::generate(h_vec.begin(), h_vec.end(), [&] { return dist(rng); });
+
+  // Asynchronously transfer to the device.
+  thrust::device_vector<double> d_vec(h_vec.size());
+  thrust::device_event e = thrust::async::copy(h_vec.begin(), h_vec.end(),
+                                               d_vec.begin());
+
+  // After the transfer completes, asynchronously compute the sum on the device.
+  thrust::device_future<double> f0 = thrust::async::reduce(thrust::device.after(e),
+                                                           d_vec.begin(), d_vec.end(),
+                                                           0.0, thrust::plus<double>());
+
+  // While the sum is being computed on the device, compute the sum serially on
+  // the host.
+  double f1 = std::accumulate(h_vec.begin(), h_vec.end(), 0.0, thrust::plus<double>());
 }
 ```
 
-Refer to the [Quick Start Guide](http://github.com/thrust/thrust/wiki/Quick-Start-Guide) page for further information and examples.
+[See it on Godbolt](https://godbolt.org/z/be54efaKj)
+
+## Getting The Thrust Source Code
+
+Thrust is a header-only library; there is no need to build or install the project
+unless you want to run the Thrust unit tests.
+
+The CUDA Toolkit provides a recent release of the Thrust source code in
+`include/thrust`. This will be suitable for most users.
+
+Users that wish to contribute to Thrust or try out newer features should
+recursively clone the Thrust Github repository:
+
+```
+git clone --recursive https://github.com/NVIDIA/thrust.git
+```
+
+## Using Thrust From Your Project
+
+For CMake-based projects, we provide a CMake package for use with
+`find_package`. See the [CMake README](thrust/cmake/README.md) for more
+information. Thrust can also be added via `add_subdirectory` or tools like
+the [CMake Package Manager](https://github.com/cpm-cmake/CPM.cmake).
+
+For non-CMake projects, compile with:
+- The Thrust include path (`-I<thrust repo root>`)
+- The libcu++ include path (`-I<thrust repo root>/dependencies/libcudacxx/`)
+- The CUB include path, if using the CUDA device system (`-I<thrust repo root>/dependencies/cub/`)
+- By default, the CPP host system and CUDA device system are used.
+  These can be changed using compiler definitions:
+  - `-DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_XXX`,
+     where `XXX` is `CPP` (serial, default), `OMP` (OpenMP), or `TBB` (Intel TBB)
+  - `-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_XXX`, where `XXX` is
+    `CPP`, `OMP`, `TBB`, or `CUDA` (default).
+
+## Developing Thrust
+
+Thrust uses the [CMake build system] to build unit tests, examples, and header
+  tests.
+To build Thrust as a developer, it is recommended that you use our
+  containerized development system:
+
+```bash
+# Clone Thrust and CUB repos recursively:
+git clone --recursive https://github.com/NVIDIA/thrust.git
+cd thrust
+
+# Build and run tests and examples:
+ci/local/build.bash
+```
+
+That does the equivalent of the following, but in a clean containerized
+  environment which has all dependencies installed:
+
+```bash
+# Clone Thrust and CUB repos recursively:
+git clone --recursive https://github.com/NVIDIA/thrust.git
+cd thrust
+
+# Create build directory:
+mkdir build
+cd build
+
+# Configure -- use one of the following:
+cmake ..   # Command line interface.
+ccmake ..  # ncurses GUI (Linux only).
+cmake-gui  # Graphical UI, set source/build directories in the app.
+
+# Build:
+cmake --build . -j ${NUM_JOBS} # Invokes make (or ninja, etc).
+
+# Run tests and examples:
+ctest
+```
+
+By default, a serial `CPP` host system, `CUDA` accelerated device system, and
+  C++14 standard are used.
+This can be changed in CMake and via flags to `ci/local/build.bash`
+
+More information on configuring your Thrust build and creating a pull request
+  can be found in the [contributing section].
+
+## Licensing
+
+Thrust is an open source project developed on [GitHub].
+Thrust is distributed under the [Apache License v2.0 with LLVM Exceptions];
+  some parts are distributed under the [Apache License v2.0] and the
+  [Boost License v1.0].
+
+## CI Status
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-gpu-build/CXX_TYPE=gcc,CXX_VER=9,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-gpu-build/CXX_TYPE=gcc,CXX_VER=9,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20GCC%209%20build%20and%20device%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=11,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=11,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20GCC%2011%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=10,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=10,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20GCC%2010%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=9,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=9,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20GCC%209%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=8,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=8,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20GCC%208%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=7,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=7,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20GCC%207%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=6,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=6,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20GCC%206%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=5,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=5,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20GCC%205%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=12,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=12,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20Clang%2012%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=11,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=11,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20Clang%2011%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=10,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=10,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20Clang%2010%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=9,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=9,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20Clang%209%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=8,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=8,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20Clang%208%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=7,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=7,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20Clang%207%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=icc,CXX_VER=latest,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=icc,CXX_VER=latest,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20ICC%20build%20and%20host%20tests'></a>
+
+
+
+[GitHub]: https://github.com/nvidia/thrust
+
+[CMake section]: https://nvidia.github.io/thrust/setup/cmake_options.html
+[contributing section]: https://nvidia.github.io/thrust/contributing.html
 
-Development process
--------------------
+[CMake build system]: https://cmake.org
 
-For information on development process and branching, see [this document](doc/branching.md).
+[Apache License v2.0 with LLVM Exceptions]: https://llvm.org/LICENSE.txt
+[Apache License v2.0]: https://www.apache.org/licenses/LICENSE-2.0.txt
+[Boost License v1.0]: https://www.boost.org/LICENSE_1_0.txt
 
diff --git a/ci/axis/cpu.yml b/ci/axis/cpu.yml
new file mode 100644
index 000000000..cc393169d
--- /dev/null
+++ b/ci/axis/cpu.yml
@@ -0,0 +1,61 @@
+# Copyright (c) 2018-2020 NVIDIA Corporation
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# Released under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+
+SDK_TYPE:
+  - cuda
+
+SDK_VER:
+  - 11.7.0-devel
+
+OS_TYPE:
+  - ubuntu
+
+OS_VER:
+  - 20.04
+
+CXX_TYPE:
+  - clang
+  - gcc
+  - icc
+
+CXX_VER:
+  - 5
+  - 6
+  - 7
+  - 8
+  - 9
+  - 10
+  - 11
+  - 12
+  - latest
+
+exclude:
+  # Excludes by `CXX_VER`.
+  - CXX_TYPE: gcc
+    CXX_VER: 12
+  - CXX_TYPE: gcc
+    CXX_VER: latest
+  - CXX_TYPE: clang
+    CXX_VER: 5
+  - CXX_TYPE: clang
+    CXX_VER: 6
+  - CXX_TYPE: clang
+    CXX_VER: latest
+  - CXX_TYPE: icc
+    CXX_VER: 5
+  - CXX_TYPE: icc
+    CXX_VER: 6
+  - CXX_TYPE: icc
+    CXX_VER: 7
+  - CXX_TYPE: icc
+    CXX_VER: 8
+  - CXX_TYPE: icc
+    CXX_VER: 9
+  - CXX_TYPE: icc
+    CXX_VER: 10
+  - CXX_TYPE: icc
+    CXX_VER: 11
+  - CXX_TYPE: icc
+    CXX_VER: 12
diff --git a/ci/axis/gpu.yml b/ci/axis/gpu.yml
new file mode 100644
index 000000000..550083aab
--- /dev/null
+++ b/ci/axis/gpu.yml
@@ -0,0 +1,22 @@
+# Copyright (c) 2018-2020 NVIDIA Corporation
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# Released under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+
+SDK_TYPE:
+  - cuda
+
+SDK_VER:
+  - 11.7.0-devel
+
+OS_TYPE:
+  - ubuntu
+
+OS_VER:
+  - 20.04
+
+CXX_TYPE:
+  - gcc
+
+CXX_VER:
+  - 9
diff --git a/ci/common/build.bash b/ci/common/build.bash
new file mode 100755
index 000000000..37aafaf8b
--- /dev/null
+++ b/ci/common/build.bash
@@ -0,0 +1,439 @@
+#! /usr/bin/env bash
+
+# Copyright (c) 2018-2022 NVIDIA Corporation
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# Released under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+
+################################################################################
+# Thrust and CUB build script for gpuCI
+################################################################################
+
+set -e # Stop on errors.
+
+# append variable value
+# Appends ${value} to ${variable}, adding a space before ${value} if
+# ${variable} is not empty.
+function append {
+  tmp="${!1:+${!1} }${2}"
+  eval "${1}=\${tmp}"
+}
+
+# log args...
+# Prints out ${args[*]} with a gpuCI log prefix and a newline before and after.
+function log() {
+  printf "\n>>>> %s\n\n" "${*}"
+}
+
+# print_with_trailing_blank_line args...
+# Prints ${args[*]} with one blank line following, preserving newlines within
+# ${args[*]} but stripping any preceding ${args[*]}.
+function print_with_trailing_blank_line {
+  printf "%s\n\n" "${*}"
+}
+
+# echo_and_run name args...
+# Echo ${args[@]}, then execute ${args[@]}
+function echo_and_run {
+  echo "${1}: ${@:2}"
+  ${@:2}
+}
+
+# echo_and_run_timed name args...
+# Echo ${args[@]}, then execute ${args[@]} and report how long it took,
+# including ${name} in the output of the time.
+function echo_and_run_timed {
+  echo "${@:2}"
+  TIMEFORMAT=$'\n'"${1} Time: %lR"
+  time ${@:2}
+}
+
+# join_delimit <delimiter> [value [value [...]]]
+# Combine all values into a single string, separating each by a single character
+# delimiter. Eg:
+# foo=(bar baz kramble)
+# joined_foo=$(join_delimit "|" "${foo[@]}")
+# echo joined_foo # "bar|baz|kramble"
+function join_delimit {
+  local IFS="${1}"
+  shift
+  echo "${*}"
+}
+
+################################################################################
+# VARIABLES - Set up bash and environmental variables.
+################################################################################
+
+# Get the variables the Docker container set up for us: ${CXX}, ${CUDACXX}, etc.
+set +e # Don't stop on errors from /etc/cccl.bashrc.
+source /etc/cccl.bashrc
+set -e # Stop on errors.
+
+# Configure sccache.
+if [[ "${CXX_TYPE}" == "nvcxx" ]]; then
+  log "Disabling sccache (nvcxx not supported)"
+  unset ENABLE_SCCACHE
+elif [[ "${BUILD_MODE}" == "pull-request" || "${BUILD_MODE}" == "branch" ]]; then
+  # gpuCI builds cache in S3.
+  export ENABLE_SCCACHE="gpuCI"
+  # Change to 'thrust-aarch64' if we add aarch64 builds to gpuCI:
+  export SCCACHE_S3_KEY_PREFIX=thrust-linux64 # [linux64]
+  export SCCACHE_BUCKET=rapids-sccache-east
+  export SCCACHE_REGION=us-east-2
+  export SCCACHE_IDLE_TIMEOUT=32768
+else
+  export ENABLE_SCCACHE="local"
+  # local builds cache locally
+  export SCCACHE_DIR="${WORKSPACE}/build-sccache"
+fi
+
+# Set sccache compiler flags
+if [[ -n "${ENABLE_SCCACHE}" ]]; then
+  export CMAKE_CUDA_COMPILER_LAUNCHER="sccache"
+  export CMAKE_CXX_COMPILER_LAUNCHER="sccache"
+  export CMAKE_C_COMPILER_LAUNCHER="sccache"
+fi
+
+# Set path.
+export PATH=/usr/local/cuda/bin:${PATH}
+
+# Set home to the job's workspace.
+export HOME=${WORKSPACE}
+
+# Per-process memory util logs:
+MEMMON_LOG=${WORKSPACE}/build/memmon_log
+
+# Switch to the build directory.
+cd ${WORKSPACE}
+mkdir -p build
+cd build
+
+# Remove any old .ninja_log file so the PrintNinjaBuildTimes step is accurate:
+rm -f .ninja_log
+
+if [[ -z "${CMAKE_BUILD_TYPE}" ]]; then
+  CMAKE_BUILD_TYPE="Release"
+fi
+
+CMAKE_BUILD_FLAGS="--"
+
+# The Docker image sets up `${CXX}` and `${CUDACXX}`.
+append CMAKE_FLAGS "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
+append CMAKE_FLAGS "-DCMAKE_CUDA_COMPILER='${CUDACXX}'"
+
+if [[ "${CXX_TYPE}" == "nvcxx" ]]; then
+  # NVC++ isn't properly detected by CMake, so we have to tell CMake to ignore
+  # detection and explicit provide the compiler ID. Ninja currently isn't
+  # supported, so we just use makefiles.
+  append CMAKE_FLAGS "-DCMAKE_CUDA_COMPILER_FORCED=ON"
+  append CMAKE_FLAGS "-DCMAKE_CUDA_COMPILER_ID=NVCXX"
+  # We use NVC++ "slim" image which only contain a single CUDA toolkit version.
+  # When using NVC++ in an environment without GPUs (like our CPU-only
+  # builders) it unfortunately defaults to the oldest CUDA toolkit version it
+  # supports, even if that version is not in the image. So, we have to
+  # explicitly tell NVC++ it which CUDA toolkit version to use.
+  CUDA_VER=$(echo ${SDK_VER} | sed 's/.*\(cuda[0-9]\+\.[0-9]\+\)/\1/')
+  append CMAKE_FLAGS "-DCMAKE_CUDA_FLAGS=-gpu=${CUDA_VER}"
+  # Don't stop on build failures.
+  append CMAKE_BUILD_FLAGS "-k"
+else
+  if [[ "${CXX_TYPE}" == "icc" ]]; then
+    # Only the latest version of the Intel C++ compiler, which NVCC doesn't
+    # officially support yet, is freely available.
+    append CMAKE_FLAGS "-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
+  fi
+  # We're using NVCC so we need to set the host compiler.
+  append CMAKE_FLAGS "-DCMAKE_CXX_COMPILER='${CXX}'"
+  append CMAKE_FLAGS "-G Ninja"
+  # Don't stop on build failures.
+  append CMAKE_BUILD_FLAGS "-k0"
+fi
+
+DETERMINE_PARALLELISM_FLAGS=""
+
+# Used to limit the number of default build threads. Any build/link
+# steps that exceed this limit will cause this script to report a
+# failure. Tune this using the memmon logs printed after each run.
+#
+# Build steps that take more memory than this limit should
+# be split into multiple steps/translation units. Any temporary
+# increases to this threshold should be reverted ASAP. The goal
+# to do decrease this as much as possible and not increase it.
+if [[ -z "${MIN_MEMORY_PER_THREAD}" ]]; then
+  if [[ "${CXX_TYPE}" == "nvcxx" ]]; then
+      MIN_MEMORY_PER_THREAD=3.0 # GiB
+  elif [[ "${CXX_TYPE}" == "icc" ]]; then
+      MIN_MEMORY_PER_THREAD=2.5 # GiB
+  else
+      MIN_MEMORY_PER_THREAD=2.0 # GiB
+  fi
+fi
+append DETERMINE_PARALLELISM_FLAGS "--min-memory-per-thread ${MIN_MEMORY_PER_THREAD}"
+
+if [[ -n "${PARALLEL_LEVEL}" ]]; then
+  append DETERMINE_PARALLELISM_FLAGS "-j ${PARALLEL_LEVEL}"
+fi
+
+# COVERAGE_PLAN options:
+# * Exhaustive
+# * Thorough
+# * Minimal
+if [[ -z "${COVERAGE_PLAN}" ]]; then
+  # `ci/local/build.bash` always sets a coverage plan, so we can assume we're
+  # in gpuCI if one was not set.
+  if [[ "${CXX_TYPE}" == "nvcxx" ]]; then
+    # Today, NVC++ builds take too long to do anything more than Minimal.
+    COVERAGE_PLAN="Minimal"
+  elif [[ "${BUILD_TYPE}" == "cpu" ]] && [[ "${BUILD_MODE}" == "branch" ]]; then
+    # Post-commit CPU CI builds.
+    COVERAGE_PLAN="Exhaustive"
+  elif [[ "${BUILD_TYPE}" == "cpu" ]]; then
+    # Pre-commit CPU CI builds.
+    COVERAGE_PLAN="Thorough"
+  elif [[ "${BUILD_TYPE}" == "gpu" ]]; then
+    # Pre- and post-commit GPU CI builds.
+    COVERAGE_PLAN="Minimal"
+  fi
+fi
+
+case "${COVERAGE_PLAN}" in
+  Exhaustive)
+    append CMAKE_FLAGS "-DTHRUST_ENABLE_MULTICONFIG=ON"
+    append CMAKE_FLAGS "-DTHRUST_IGNORE_DEPRECATED_CPP_11=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_DIALECT_ALL=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_CPP=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_TBB=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_OMP=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA=ON"
+    append CMAKE_FLAGS "-DTHRUST_INCLUDE_CUB_CMAKE=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_WORKLOAD=LARGE"
+    ;;
+  Thorough)
+    # Build the legacy bench.cu. We'll probably want to remove this when we
+    # switch to the new, heavier thrust_benchmarks project.
+    append CMAKE_FLAGS "-DTHRUST_ENABLE_BENCHMARKS=ON"
+    append CMAKE_FLAGS "-DTHRUST_ENABLE_MULTICONFIG=ON"
+    append CMAKE_FLAGS "-DTHRUST_IGNORE_DEPRECATED_CPP_11=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_DIALECT_ALL=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_CPP=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_TBB=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_OMP=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_WORKLOAD=SMALL"
+    append CMAKE_FLAGS "-DTHRUST_INCLUDE_CUB_CMAKE=ON"
+    append CMAKE_FLAGS "-DTHRUST_AUTO_DETECT_COMPUTE_ARCHS=ON"
+    if [[ "${CXX_TYPE}" != "nvcxx" ]]; then
+      # NVC++ can currently only target one compute architecture at a time.
+      append CMAKE_FLAGS "-DTHRUST_ENABLE_COMPUTE_50=ON"
+      append CMAKE_FLAGS "-DTHRUST_ENABLE_COMPUTE_60=ON"
+      append CMAKE_FLAGS "-DTHRUST_ENABLE_COMPUTE_70=ON"
+    fi
+    append CMAKE_FLAGS "-DTHRUST_ENABLE_COMPUTE_80=ON"
+    ;;
+  Minimal)
+    append CMAKE_FLAGS "-DTHRUST_ENABLE_MULTICONFIG=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_DIALECT_LATEST=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_CPP=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_TBB=OFF"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_OMP=OFF"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_WORKLOAD=SMALL"
+    append CMAKE_FLAGS "-DTHRUST_INCLUDE_CUB_CMAKE=ON"
+    append CMAKE_FLAGS "-DTHRUST_AUTO_DETECT_COMPUTE_ARCHS=ON"
+    if [[ "${BUILD_TYPE}" == "cpu" ]] && [[ "${CXX_TYPE}" == "nvcxx" ]]; then
+      # If no GPU is automatically detected, NVC++ insists that you explicitly
+      # provide an architecture.
+      # TODO: This logic should really be moved into CMake, but it will be
+      # tricky to do that until CMake officially supports NVC++.
+      append CMAKE_FLAGS "-DTHRUST_ENABLE_COMPUTE_80=ON"
+    fi
+    ;;
+esac
+
+if [[ -n "${@}" ]]; then
+  append CMAKE_BUILD_FLAGS "${@}"
+fi
+
+append CTEST_FLAGS "--output-on-failure"
+
+CTEST_EXCLUSION_REGEXES=()
+
+if [[ "${BUILD_TYPE}" == "cpu" ]]; then
+  CTEST_EXCLUSION_REGEXES+=("^cub" "^thrust.*cuda")
+fi
+
+if [[ -n "${CTEST_EXCLUSION_REGEXES[@]}" ]]; then
+  CTEST_EXCLUSION_REGEX=$(join_delimit "|" "${CTEST_EXCLUSION_REGEXES[@]}")
+  append CTEST_FLAGS "-E ${CTEST_EXCLUSION_REGEX}"
+fi
+
+if [[ -n "${@}" ]]; then
+  CTEST_INCLUSION_REGEX=$(join_delimit "|" "${@}")
+  append CTEST_FLAGS "-R ^${CTEST_INCLUSION_REGEX[@]}$"
+fi
+
+# Export variables so they'll show up in the logs when we report the environment.
+export COVERAGE_PLAN
+export CMAKE_FLAGS
+export CMAKE_BUILD_FLAGS
+export CTEST_FLAGS
+
+################################################################################
+# ENVIRONMENT - Configure and print out information about the environment.
+################################################################################
+
+log "Determine system topology..."
+
+# Set `${PARALLEL_LEVEL}` if it is unset; otherwise, this just reports the
+# system topology.
+source ${WORKSPACE}/ci/common/determine_build_parallelism.bash ${DETERMINE_PARALLELISM_FLAGS}
+
+log "Get environment..."
+
+env | sort
+
+log "Check versions..."
+
+# We use sed and echo below to ensure there is always one and only trailing
+# line following the output from each tool.
+
+${CXX} --version 2>&1 | sed -Ez '$ s/\n*$/\n/'
+
+echo
+
+${CUDACXX} --version 2>&1 | sed -Ez '$ s/\n*$/\n/'
+
+echo
+
+cmake --version 2>&1 | sed -Ez '$ s/\n*$/\n/'
+
+if [[ "${BUILD_TYPE}" == "gpu" ]]; then
+  echo
+  nvidia-smi 2>&1 | sed -Ez '$ s/\n*$/\n/'
+fi
+
+if [[ -n "${ENABLE_SCCACHE}" ]]; then
+  echo
+  # Set sccache statistics to zero to capture clean run.
+  sccache --version
+  sccache --zero-stats | grep location
+fi
+
+################################################################################
+# BUILD - Build Thrust and CUB examples and tests.
+################################################################################
+
+log "Configure Thrust and CUB..."
+
+echo_and_run_timed "Configure" cmake .. --log-level=VERBOSE ${CMAKE_FLAGS}
+configure_status=$?
+
+log "Build Thrust and CUB..."
+
+# ${PARALLEL_LEVEL} needs to be passed after we run
+# determine_build_parallelism.bash, so it can't be part of ${CMAKE_BUILD_FLAGS}.
+set +e # Don't stop on build failures.
+
+# Monitor memory usage. Thresholds in GiB:
+python3 ${WORKSPACE}/ci/common/memmon.py \
+	--log-threshold 0.0 \
+	--fail-threshold ${MIN_MEMORY_PER_THREAD} \
+	--log-file ${MEMMON_LOG} \
+        &
+memmon_pid=$!
+
+echo_and_run_timed "Build" cmake --build . ${CMAKE_BUILD_FLAGS} -j ${PARALLEL_LEVEL}
+build_status=$?
+
+# Stop memmon:
+kill -s SIGINT ${memmon_pid}
+
+# Re-enable exit on failure:
+set -e
+
+################################################################################
+# TEST - Run Thrust and CUB examples and tests.
+################################################################################
+
+log "Test Thrust and CUB..."
+
+(
+  # Make sure test_status captures ctest, not tee:
+  # https://stackoverflow.com/a/999259/11130318
+  set -o pipefail
+  echo_and_run_timed "Test" ctest ${CTEST_FLAGS} | tee ctest_log
+)
+test_status=$?
+
+################################################################################
+# COMPILATION STATS
+################################################################################
+
+if [[ -n "${ENABLE_SCCACHE}" ]]; then
+  # Get sccache stats after the compile is completed
+  COMPILE_REQUESTS=$(sccache -s | grep "Compile requests \+ [0-9]\+$" | awk '{ print $NF }')
+  CACHE_HITS=$(sccache -s | grep "Cache hits \+ [0-9]\+$" | awk '{ print $NF }')
+  HIT_RATE=$(echo - | awk "{printf \"%.2f\n\", $CACHE_HITS / $COMPILE_REQUESTS * 100}")
+  log "sccache stats (${HIT_RATE}% hit):"
+  sccache -s
+fi
+
+################################################################################
+# COMPILE TIME INFO: Print the 20 longest running build steps (ninja only)
+################################################################################
+
+if [[ -f ".ninja_log" ]]; then
+  log "Checking slowest build steps:"
+  echo_and_run "CompileTimeInfo" cmake -P ../cmake/PrintNinjaBuildTimes.cmake | head -n 23
+fi
+
+################################################################################
+# RUNTIME INFO: Print the 20 longest running test steps
+################################################################################
+
+if [[ -f "ctest_log" ]]; then
+  log "Checking slowest test steps:"
+  echo_and_run "TestTimeInfo" cmake -DLOGFILE=ctest_log -P ../cmake/PrintCTestRunTimes.cmake | head -n 20
+fi
+
+################################################################################
+# MEMORY_USAGE
+################################################################################
+
+memmon_status=0
+if [[ -f "${MEMMON_LOG}" ]]; then
+  log "Checking memmon logfile: ${MEMMON_LOG}"
+
+  if [[ -n "$(grep -E "^FAIL" ${MEMMON_LOG})" ]]; then
+    log "error: Some build steps exceeded memory threshold (${MIN_MEMORY_PER_THREAD} GiB):"
+    grep -E "^FAIL" ${MEMMON_LOG}
+    memmon_status=1
+  else
+    log "Top memory usage per build step (all less than limit of ${MIN_MEMORY_PER_THREAD} GiB):"
+    if [[ -s ${MEMMON_LOG} ]]; then
+      # Not empty:
+      head -n5 ${MEMMON_LOG}
+    else
+      echo "None detected above logging threshold."
+    fi
+  fi
+fi
+
+################################################################################
+# SUMMARY - Print status of each step and exit with failure if needed.
+################################################################################
+
+log "Summary:"
+echo "Warnings:"
+# Not currently a failure; sccache makes these unreliable and intermittent:
+echo "- Build Memory Check: ${memmon_status}"
+echo "Failures:"
+echo "- Configure Error Code: ${configure_status}"
+echo "- Build Error Code: ${build_status}"
+echo "- Test Error Code: ${test_status}"
+
+if [[ "${configure_status}" != "0" ]] || \
+   [[ "${build_status}" != "0" ]] || \
+   [[ "${test_status}" != "0" ]]; then
+     exit 1
+fi
diff --git a/ci/common/determine_build_parallelism.bash b/ci/common/determine_build_parallelism.bash
new file mode 100755
index 000000000..9813fcb2f
--- /dev/null
+++ b/ci/common/determine_build_parallelism.bash
@@ -0,0 +1,119 @@
+#! /usr/bin/env bash
+
+# Copyright (c) 2018-2020 NVIDIA Corporation
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# Released under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+
+function usage {
+  echo "Usage: ${0} [flags...]"
+  echo
+  echo "Examine the system topology to determine a reasonable amount of build"
+  echo "parallelism."
+  echo
+  echo "Exported variables:"
+  echo "  \${LOGICAL_CPUS}          : Logical processors (e.g. threads)."
+  echo "  \${PHYSICAL_CPUS}         : Physical processors (e.g. cores)."
+  echo "  \${TOTAL_MEM}             : Total system memory [GB]."
+  echo "  \${MAX_THREADS_PER_CORE}  : Maximum threads per core allowed."
+  echo "  \${MIN_MEMORY_PER_THREAD} : Minimum memory [GB] per thread allowed."
+  echo "  \${CPU_BOUND_THREADS}     : # of build threads constrained by processors."
+  echo "  \${MEM_BOUND_THREADS}     : # of build threads constrained by memory [GB]."
+  echo "  \${PARALLEL_LEVEL}        : Determined # of build threads."
+  echo "  \${MEM_PER_THREAD}        : Memory [GB] per build thread."
+  echo
+  echo "-h, -help, --help"
+  echo "  Print this message."
+  echo
+  echo "-q, --quiet"
+  echo "  Print nothing and only export variables."
+  echo
+  echo "-j <threads>, --jobs <threads>"
+  echo "  Explicitly set the number of build threads to use."
+  echo
+  echo "--max-threads-per-core <threads>"
+  echo "  Specify the maximum threads per core allowed (default: ${MAX_THREADS_PER_CORE} [threads/core])."
+  echo
+  echo "--min-memory-per-thread <gigabytes>"
+  echo "  Specify the minimum memory per thread allowed (default: ${MIN_MEMORY_PER_THREAD} [GBs/thread])."
+
+  exit -3
+}
+
+QUIET=0
+
+export MAX_THREADS_PER_CORE=2
+export MIN_MEMORY_PER_THREAD=4 # [GB]
+
+while test ${#} != 0
+do
+  case "${1}" in
+  -h) ;&
+  -help) ;&
+  --help) usage ;;
+  -q) ;&
+  --quiet) QUIET=1 ;;
+  -j) ;&
+  --jobs)
+    shift # The next argument is the number of threads.
+    PARALLEL_LEVEL="${1}"
+    ;;
+  --max-threads-per-core)
+    shift # The next argument is the number of threads per core.
+    MAX_THREADS_PER_CORE="${1}"
+    ;;
+  --min-memory-per-thread)
+    shift # The next argument is the amount of memory per thread.
+    MIN_MEMORY_PER_THREAD="${1}"
+    ;;
+  esac
+  shift
+done
+
+# https://stackoverflow.com/a/23378780
+if [ $(uname) == "Darwin" ]; then
+  export LOGICAL_CPUS=$(sysctl -n hw.logicalcpu_max)
+  export PHYSICAL_CPUS=$(sysctl -n hw.physicalcpu_max)
+else
+  export LOGICAL_CPUS=$(lscpu -p | egrep -v '^#' | wc -l)
+  export PHYSICAL_CPUS=$(lscpu -p | egrep -v '^#' | sort -u -t, -k 2,4 | wc -l)
+fi
+
+export TOTAL_MEM=$(awk "BEGIN { printf \"%0.4g\", $(grep MemTotal /proc/meminfo | awk '{ print $2 }') / (1024 * 1024) }")
+
+export CPU_BOUND_THREADS=$(awk "BEGIN { printf \"%.04g\", int(${PHYSICAL_CPUS} * ${MAX_THREADS_PER_CORE}) }")
+export MEM_BOUND_THREADS=$(awk "BEGIN { printf \"%.04g\", int(${TOTAL_MEM} / ${MIN_MEMORY_PER_THREAD}) }")
+
+if [[ -z "${PARALLEL_LEVEL}" ]]; then
+  # Pick the smaller of the two as the default.
+  if [[ "${MEM_BOUND_THREADS}" -lt "${CPU_BOUND_THREADS}" ]]; then
+    export PARALLEL_LEVEL=${MEM_BOUND_THREADS}
+  else
+    export PARALLEL_LEVEL=${CPU_BOUND_THREADS}
+  fi
+else
+  EXPLICIT_PARALLEL_LEVEL=1
+fi
+
+# This can be a floating point number.
+export MEM_PER_THREAD=$(awk "BEGIN { printf \"%.04g\", ${TOTAL_MEM} / ${PARALLEL_LEVEL} }")
+
+if [[ "${QUIET}" == 0 ]]; then
+  echo    "Logical CPUs:           ${LOGICAL_CPUS} [threads]"
+  echo    "Physical CPUs:          ${PHYSICAL_CPUS} [cores]"
+  echo    "Total Mem:              ${TOTAL_MEM} [GBs]"
+  echo    "Max Threads Per Core:   ${MAX_THREADS_PER_CORE} [threads/core]"
+  echo    "Min Memory Per Threads: ${MIN_MEMORY_PER_THREAD} [GBs/thread]"
+  echo    "CPU Bound Threads:      ${CPU_BOUND_THREADS} [threads]"
+  echo    "Mem Bound Threads:      ${MEM_BOUND_THREADS} [threads]"
+
+  echo -n "Parallel Level:         ${PARALLEL_LEVEL} [threads]"
+  if [[ -n "${EXPLICIT_PARALLEL_LEVEL}" ]]; then
+    echo " (explicitly set)"
+  else
+    echo
+  fi
+
+  echo    "Mem Per Thread:         ${MEM_PER_THREAD} [GBs/thread]"
+fi
+
diff --git a/ci/common/memmon.py b/ci/common/memmon.py
new file mode 100755
index 000000000..505503733
--- /dev/null
+++ b/ci/common/memmon.py
@@ -0,0 +1,110 @@
+#! /usr/bin/env python
+
+# Copyright (c) 2022 NVIDIA Corporation
+# Reply-To: Allison Vacanti <alliepiper16@gmail.com>
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# Released under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+
+help_text = """%(prog)s [reference.json compare.json | reference_dir/ compare_dir/]
+
+This script:
+
+1. Runs `top -bco RES`, continuously extracting the memory usage of each process.
+2. If a process uses more than `log_threshold` GiB and exceeds any other recorded
+   entry for the process, it is stored in `entries`.
+3. When this script receives SIGINT, it writes two files:
+  * `log_file` will contain all recorded max-memory-per-process entries
+  * `fail_file` will contain all entries that exceed `fail_threshold`
+"""
+
+import argparse
+import os
+import re
+import signal
+import sys
+
+from subprocess import Popen, PIPE, STDOUT
+
+parser = argparse.ArgumentParser(prog='memmon.py', usage=help_text)
+parser.add_argument('--log-threshold', type=float, dest='log_threshold',
+                    default=0.5,
+                    help='Logging threshold in GiB.')
+parser.add_argument('--fail-threshold', type=float, dest='fail_threshold',
+                    default=2,
+                    help='Failure threshold in GiB.')
+parser.add_argument('--log-file', type=str, dest='log_file', default='memmon_log',
+                    help='Output file for log entries.')
+args, unused = parser.parse_known_args()
+
+entries = {}
+
+
+def signal_handler(sig, frame):
+    # Sort by mem:
+    sortentries = sorted(entries.items(), key=lambda x: x[1], reverse=True)
+
+    lf = open(args.log_file, "w")
+
+    for com, mem in sortentries:
+        status = "PASS"
+        if mem >= args.fail_threshold:
+            status = "FAIL"
+        line = "%4s | %3.1f GiB | %s\n" % (status, mem, com)
+        lf.write(line)
+
+    lf.close()
+    sys.exit(0)
+
+
+signal.signal(signal.SIGINT, signal_handler)
+
+# Find the toprc config file and configure top's env.
+# This config:
+# - Hides all columns except for RES and COMMAND
+# - Sorts by RES
+# - Enables long command strings (-c)
+script_dir = os.path.dirname(os.path.realpath(__file__))
+config_dir = os.path.join(script_dir, 'memmon_config')
+
+proc = Popen(["top", "-b", "-w", "512"],
+             stdin=PIPE, stdout=PIPE, stderr=STDOUT,
+             env={"XDG_CONFIG_HOME": config_dir})
+
+regex = re.compile("^\\s*([0-9.]+[kmgtp]?)\\s+(.+)\\s*$")
+
+
+# Convert a memory string from top into floating point GiB
+def parse_mem(mem_str):
+    if mem_str[-1] == "k":
+        return float(mem_str[:-1]) / (1024 * 1024)
+    elif mem_str[-1] == "m":
+        return float(mem_str[:-1]) / (1024)
+    elif mem_str[-1] == "g":
+        return float(mem_str[:-1])
+    elif mem_str[-1] == "t":
+        return float(mem_str[:-1]) * 1024
+    elif mem_str[-1] == "p":  # please no
+        return float(mem_str[:-1]) * 1024 * 1024
+    # bytes:
+    return float(mem_str) / (1024 * 1024 * 1024)
+
+
+for line in proc.stdout:
+    line = line.decode()
+    match = regex.match(line)
+    if match:
+        mem = parse_mem(match.group(1))
+        if mem < args.log_threshold and mem < args.fail_threshold:
+            continue
+        com = match.group(2)
+        if com in entries and entries[com] > mem:
+            continue
+        if mem >= args.fail_threshold:
+            # Print a notice immediately -- this helps identify the failures
+            # as they happen, since `com` may not provide enough info.
+            print("memmon.py failure: Build step exceed memory threshold:\n"
+                  "  - Threshold: %3.1f GiB\n"
+                  "  - Usage:     %3.1f GiB\n"
+                  "  - Command:   %s" % (args.fail_threshold, mem, com))
+        entries[com] = mem
diff --git a/ci/common/memmon_config/procps/toprc b/ci/common/memmon_config/procps/toprc
new file mode 100644
index 000000000..883a482ce
--- /dev/null
+++ b/ci/common/memmon_config/procps/toprc
@@ -0,0 +1,16 @@
+top's Config File (Linux processes with windows)
+Id:i, Mode_altscr=0, Mode_irixps=1, Delay_time=3.0, Curwin=0
+Def	fieldscur=%(34;�@D7:9�&')*+,-./012568<>?ABCFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghij
+	winflags=193972, sortindx=18, maxtasks=0, graph_cpus=0, graph_mems=0
+	summclr=1, msgsclr=1, headclr=3, taskclr=1
+Job	fieldscur=�����(��Ļ�@<��)*+,-./012568>?ABCFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghij
+	winflags=193844, sortindx=0, maxtasks=0, graph_cpus=0, graph_mems=0
+	summclr=6, msgsclr=6, headclr=7, taskclr=6
+Mem	fieldscur=���<�����MBN�D34��&'()*+,-./0125689FGHIJKLOPQRSTUVWXYZ[\]^_`abcdefghij
+	winflags=193844, sortindx=21, maxtasks=0, graph_cpus=0, graph_mems=0
+	summclr=5, msgsclr=5, headclr=4, taskclr=5
+Usr	fieldscur=�����������)+,-./1234568;<=>?@ABCFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghij
+	winflags=193844, sortindx=3, maxtasks=0, graph_cpus=0, graph_mems=0
+	summclr=3, msgsclr=3, headclr=2, taskclr=3
+Fixed_widest=0, Summ_mscale=1, Task_mscale=0, Zero_suppress=0
+
diff --git a/ci/cpu/build.bash b/ci/cpu/build.bash
new file mode 100755
index 000000000..69b99bbec
--- /dev/null
+++ b/ci/cpu/build.bash
@@ -0,0 +1,14 @@
+#! /usr/bin/env bash
+
+# Copyright (c) 2018-2020 NVIDIA Corporation
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# Released under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+
+################################################################################
+# Thrust and CUB build script for gpuCI (CPU-only)
+################################################################################
+
+export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
+
+source ${WORKSPACE}/ci/common/build.bash
diff --git a/ci/gpu/build.bash b/ci/gpu/build.bash
new file mode 100755
index 000000000..f6cdf021c
--- /dev/null
+++ b/ci/gpu/build.bash
@@ -0,0 +1,14 @@
+#! /usr/bin/env bash
+
+# Copyright (c) 2018-2020 NVIDIA Corporation
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# Released under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+
+################################################################################
+# Thrust and CUB build script for gpuCI (heterogeneous)
+################################################################################
+
+export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
+
+source ${WORKSPACE}/ci/common/build.bash
diff --git a/ci/local/build.bash b/ci/local/build.bash
new file mode 100755
index 000000000..8b20ef063
--- /dev/null
+++ b/ci/local/build.bash
@@ -0,0 +1,224 @@
+#! /usr/bin/env bash
+
+# Copyright (c) 2018-2020 NVIDIA Corporation
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# Released under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+
+################################################################################
+# Thrust and CUB local containerized build script
+################################################################################
+
+function usage {
+  echo "Usage: ${0} [flags...] [cmake-targets...]"
+  echo
+  echo "Build and test your local repository using a gpuCI Docker image."
+  echo "If CMake targets are specified, only those targets are built and tested."
+  echo "Otherwise, everything is built and tested."
+  echo
+  echo "-h, -help, --help"
+  echo "  Print this message."
+  echo
+  echo "-r <path>, --repository <path>"
+  echo "  Path to the repository (default: ${REPOSITORY_PATH})."
+  echo
+  echo "-i <image>, --image <image>"
+  echo "  Docker image to use (default: ${IMAGE})"
+  echo
+  echo "-l, --local-image"
+  echo "  Use the local version of the image instead of pulling from Docker hub."
+  echo
+  echo "-s, --shell-only"
+  echo "  Skip building and testing and launch an interactive shell instead."
+  echo
+  echo "-d, --disable-gpus"
+  echo "  Don't start the container with the NVIDIA runtime and GPUs attached."
+  echo
+  echo "-c, --clean"
+  echo "  If the build directory already exists, delete it."
+  echo
+  echo "-j <threads>, --jobs <threads>"
+  echo "  Number of threads to use when building (default: inferred)."
+  echo
+  echo "-b <type>, --cmake-build-type <plan>"
+  echo "  CMake build type to use, either Release, RelWithDebInfo, or Debug"
+  echo "  (default: ${CMAKE_BUILD_TYPE})."
+  echo
+  echo "-p <plan>, --coverage-plan <plan>"
+  echo "  Coverage plan to use, either Exhaustive, Thorough, or Minimal"
+  echo "  (default: ${COVERAGE_PLAN})."
+  echo
+
+  exit -3
+}
+
+SCRIPT_PATH=$(cd $(dirname ${0}); pwd -P)
+
+REPOSITORY_PATH=$(realpath ${SCRIPT_PATH}/../..)
+
+################################################################################
+# FLAGS - Process command line flags.
+################################################################################
+
+IMAGE="gpuci/cccl:cuda11.7.0-devel-ubuntu20.04-gcc9"
+
+LOCAL_IMAGE=0
+
+SHELL_ONLY=0
+
+BUILD_TYPE="gpu"
+
+CLEAN=0
+
+PARALLEL_LEVEL=""
+
+CMAKE_BUILD_TYPE="Release"
+
+COVERAGE_PLAN="Minimal"
+
+TARGETS=""
+
+while test ${#} != 0
+do
+  case "${1}" in
+  -h) ;&
+  -help) ;&
+  --help) usage ;;
+  -r) ;&
+  --repository)
+    shift # The next argument is the path.
+    REPOSITORY_PATH="${1}"
+    ;;
+  -i) ;&
+  --image)
+    shift # The next argument is the image.
+    IMAGE="${1}"
+    ;;
+  -l) ;&
+  --local-image) LOCAL_IMAGE=1 ;;
+  -s) ;&
+  --shell-only) SHELL_ONLY=1 ;;
+  -d) ;&
+  --disable-gpus) BUILD_TYPE="cpu" ;;
+  -c) ;&
+  --clean) CLEAN=1 ;;
+  -j) ;&
+  --jobs)
+    shift # The next argument is the number of threads.
+    PARALLEL_LEVEL="${1}"
+    ;;
+  -b) ;&
+  --cmake-build-type)
+    shift # The next argument is the build type.
+    CMAKE_BUILD_TYPE="${1}"
+    ;;
+  -p) ;&
+  --coverage-plan)
+    shift # The next argument is the coverage plan.
+    COVERAGE_PLAN="${1}"
+    ;;
+  *)
+    TARGETS="${TARGETS:+${TARGETS} }${1}"
+    ;;
+  esac
+  shift
+done
+
+################################################################################
+# PATHS - Setup paths for the container.
+################################################################################
+
+# ${REPOSITORY_PATH} is the local filesystem path to the Git repository being
+# built and tested. It can be set with the --repository flag.
+#
+# ${BUILD_PATH} is the local filesystem path that will be used for the build. It
+# is named after the image name, allowing multiple image builds to coexist on
+# the local filesystem.
+#
+# ${REPOSITORY_PATH_IN_CONTAINER} is the location of ${REPOSITORY_PATH} inside
+# the container.
+#
+# ${BUILD_PATH_IN_CONTAINER} is the location of ${BUILD_PATH} inside the
+# container.
+
+BUILD_PATH=${REPOSITORY_PATH}/build_$(echo "$(basename "${IMAGE}")" | sed -e 's/:/_/g' | sed -e 's/-/_/g')
+
+if [[ "${CLEAN}" != 0 ]]; then
+  rm -rf ${BUILD_PATH}
+fi
+
+mkdir -p ${BUILD_PATH}
+
+BASE_PATH_IN_CONTAINER="/cccl"
+
+REPOSITORY_PATH_IN_CONTAINER="${BASE_PATH_IN_CONTAINER}/$(basename "${REPOSITORY_PATH}")"
+
+BUILD_PATH_IN_CONTAINER="${BASE_PATH_IN_CONTAINER}/$(basename "${REPOSITORY_PATH}")/build"
+
+################################################################################
+# ENVIRONMENT - Setup the thunk build script that will be run by the container.
+################################################################################
+
+# We have to run `ldconfig` to rebuild `ld.so.cache` to work around this
+# failure on Debian: https://github.com/NVIDIA/nvidia-docker/issues/1399
+
+COMMAND="sudo ldconfig; sudo ldconfig"
+if [[ "${SHELL_ONLY}" != 0 ]]; then
+  COMMAND="${COMMAND}; bash"
+else
+  COMMAND="${COMMAND}; ${REPOSITORY_PATH_IN_CONTAINER}/ci/common/build.bash ${TARGETS} || bash"
+fi
+
+################################################################################
+# GPU - Setup GPUs.
+################################################################################
+
+if [[ "${BUILD_TYPE}" == "gpu" ]]; then
+  # Limit GPUs available to the container based on ${CUDA_VISIBLE_DEVICES}.
+  if [[ -z "${CUDA_VISIBLE_DEVICES}" ]]; then
+    VISIBLE_DEVICES="all"
+  else
+    VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES}"
+  fi
+
+  DOCKER_MAJOR_VER=$(docker -v | sed 's/[^[0-9]*\([0-9]*\).*/\1/')
+  GPU_OPTS="--gpus device=${VISIBLE_DEVICES}"
+  if [[ "${DOCKER_MAJOR_VER}" -lt 19 ]]
+  then
+    GPU_OPTS="--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES='${VISIBLE_DEVICES}'"
+  fi
+fi
+
+################################################################################
+# LAUNCH - Pull and launch the container.
+################################################################################
+
+NVIDIA_DOCKER_INSTALLED=$(docker info 2>&1 | grep -i runtime | grep -c nvidia)
+if [[ "${NVIDIA_DOCKER_INSTALLED}" == 0 ]]; then
+  echo "NVIDIA Docker not found, the build may fail."
+  echo "Please install it if you encounter issues: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-docker-ce"
+fi
+
+if [[ "${LOCAL_IMAGE}" == 0 ]]; then
+  docker pull "${IMAGE}"
+fi
+
+docker run --rm -it ${GPU_OPTS} \
+  --cap-add=SYS_PTRACE \
+  --user "$(id -u)":"$(id -g)" \
+  -v "${REPOSITORY_PATH}":"${REPOSITORY_PATH_IN_CONTAINER}" \
+  -v "${BUILD_PATH}":"${BUILD_PATH_IN_CONTAINER}" \
+  -v /etc/passwd:/etc/passwd:ro \
+  -v /etc/group:/etc/group:ro \
+  -v /etc/subuid:/etc/subuid:ro \
+  -v /etc/subgid:/etc/subgid:ro \
+  -v /etc/shadow:/etc/shadow:ro \
+  -v /etc/gshadow:/etc/gshadow:ro \
+  -e "WORKSPACE=${REPOSITORY_PATH_IN_CONTAINER}" \
+  -e "BUILD_TYPE=${BUILD_TYPE}" \
+  -e "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" \
+  -e "COVERAGE_PLAN=${COVERAGE_PLAN}" \
+  -e "PARALLEL_LEVEL=${PARALLEL_LEVEL}" \
+  -w "${BUILD_PATH_IN_CONTAINER}" \
+  "${IMAGE}" bash -c "${COMMAND}"
+
diff --git a/cmake/AppendOptionIfAvailable.cmake b/cmake/AppendOptionIfAvailable.cmake
index 8df9f4a33..52dc12216 100644
--- a/cmake/AppendOptionIfAvailable.cmake
+++ b/cmake/AppendOptionIfAvailable.cmake
@@ -3,7 +3,7 @@ include(CheckCXXCompilerFlag)
 
 macro (APPEND_OPTION_IF_AVAILABLE _FLAG _LIST)
 
-set(_VAR "CXX_FLAG_${_FLAG}")
+string(MAKE_C_IDENTIFIER "CXX_FLAG_${_FLAG}" _VAR)
 check_cxx_compiler_flag(${_FLAG} ${_VAR})
 
 if (${${_VAR}})
diff --git a/cmake/CheckCXXCompilerFlag.cmake b/cmake/CheckCXXCompilerFlag.cmake
deleted file mode 100644
index 87df0be8e..000000000
--- a/cmake/CheckCXXCompilerFlag.cmake
+++ /dev/null
@@ -1,64 +0,0 @@
-# Distributed under the OSI-approved BSD 3-Clause License.  See accompanying
-# file Copyright.txt or https://cmake.org/licensing for details.
-
-#[=======================================================================[.rst:
-CheckCXXCompilerFlag
-------------------------
-
-Check whether the CXX compiler supports a given flag.
-
-.. command:: check_cxx_compiler_flag
-
-  ::
-
-    check_cxx_compiler_flag(<flag> <var>)
-
-  Check that the ``<flag>`` is accepted by the compiler without
-  a diagnostic.  Stores the result in an internal cache entry
-  named ``<var>``.
-
-This command temporarily sets the ``CMAKE_REQUIRED_DEFINITIONS`` variable
-and calls the ``check_cxx_source_compiles`` macro from the
-:module:`CheckCXXSourceCompiles` module.  See documentation of that
-module for a listing of variables that can otherwise modify the build.
-
-A positive result from this check indicates only that the compiler did not
-issue a diagnostic message when given the flag.  Whether the flag has any
-effect or even a specific one is beyond the scope of this module.
-
-.. note::
-  Since the :command:`try_compile` command forwards flags from variables
-  like :variable:`CMAKE_CXX_FLAGS <CMAKE_<LANG>_FLAGS>`, unknown flags
-  in such variables may cause a false negative for this check.
-#]=======================================================================]
-
-include_guard(GLOBAL)
-include(CheckCXXSourceCompiles)
-include(CMakeCheckCompilerFlagCommonPatterns)
-
-macro (CHECK_CXX_COMPILER_FLAG _FLAG _RESULT)
-   set(SAFE_CMAKE_REQUIRED_DEFINITIONS "${CMAKE_REQUIRED_DEFINITIONS}")
-   set(CMAKE_REQUIRED_DEFINITIONS "${_FLAG}")
-
-   # Normalize locale during test compilation.
-   set(_CheckCXXCompilerFlag_LOCALE_VARS LC_ALL LC_MESSAGES LANG)
-   foreach(v ${_CheckCXXCompilerFlag_LOCALE_VARS})
-     set(_CheckCXXCompilerFlag_SAVED_${v} "$ENV{${v}}")
-     set(ENV{${v}} C)
-   endforeach()
-   CHECK_COMPILER_FLAG_COMMON_PATTERNS(_CheckCXXCompilerFlag_COMMON_PATTERNS)
-   CHECK_CXX_SOURCE_COMPILES("int main() { return 0; }" "${_RESULT}" "CXX flag ${_FLAG}"
-     # Some compilers do not fail with a bad flag
-     FAIL_REGEX "command line option .* is valid for .* but not for C\\\\+\\\\+" # GNU
-     ${_CheckCXXCompilerFlag_COMMON_PATTERNS}
-     )
-   foreach(v ${_CheckCXXCompilerFlag_LOCALE_VARS})
-     set(ENV{${v}} ${_CheckCXXCompilerFlag_SAVED_${v}})
-     unset(_CheckCXXCompilerFlag_SAVED_${v})
-   endforeach()
-   unset(_CheckCXXCompilerFlag_LOCALE_VARS)
-   unset(_CheckCXXCompilerFlag_COMMON_PATTERNS)
-
-   set (CMAKE_REQUIRED_DEFINITIONS "${SAFE_CMAKE_REQUIRED_DEFINITIONS}")
-endmacro ()
-
diff --git a/cmake/CheckCXXSourceCompiles.cmake b/cmake/CheckCXXSourceCompiles.cmake
deleted file mode 100644
index 38e915c27..000000000
--- a/cmake/CheckCXXSourceCompiles.cmake
+++ /dev/null
@@ -1,135 +0,0 @@
-# Distributed under the OSI-approved BSD 3-Clause License.  See accompanying
-# file Copyright.txt or https://cmake.org/licensing for details.
-
-#[=======================================================================[.rst:
-CheckCXXSourceCompiles
-----------------------
-
-Check if given C++ source compiles and links into an executable.
-
-.. command:: check_cxx_source_compiles
-
-  ::
-
-    check_cxx_source_compiles(code resultVar [FAIL_REGEX regex1 [regex2...]])
-
-  Check that the source supplied in ``code`` can be compiled as a C++ source
-  file and linked as an executable (so it must contain at least a ``main()``
-  function). The result will be stored in the internal cache variable specified
-  by ``resultVar``, with a boolean true value for success and boolean false for
-  failure. If ``FAIL_REGEX`` is provided, then failure is determined by
-  checking if anything in the output matches any of the specified regular
-  expressions.
-
-  The underlying check is performed by the :command:`try_compile` command. The
-  compile and link commands can be influenced by setting any of the following
-  variables prior to calling ``check_cxx_source_compiles()``:
-
-  ``CMAKE_REQUIRED_FLAGS``
-    Additional flags to pass to the compiler. Note that the contents of
-    :variable:`CMAKE_CXX_FLAGS <CMAKE_<LANG>_FLAGS>` and its associated
-    configuration-specific variable are automatically added to the compiler
-    command before the contents of ``CMAKE_REQUIRED_FLAGS``.
-
-  ``CMAKE_REQUIRED_DEFINITIONS``
-    A :ref:`;-list <CMake Language Lists>` of compiler definitions of the form
-    ``-DFOO`` or ``-DFOO=bar``. A definition for the name specified by
-    ``resultVar`` will also be added automatically.
-
-  ``CMAKE_REQUIRED_INCLUDES``
-    A :ref:`;-list <CMake Language Lists>` of header search paths to pass to
-    the compiler. These will be the only header search paths used by
-    ``try_compile()``, i.e. the contents of the :prop_dir:`INCLUDE_DIRECTORIES`
-    directory property will be ignored.
-
-  ``CMAKE_REQUIRED_LIBRARIES``
-    A :ref:`;-list <CMake Language Lists>` of libraries to add to the link
-    command. These can be the name of system libraries or they can be
-    :ref:`Imported Targets <Imported Targets>` (see :command:`try_compile` for
-    further details).
-
-  ``CMAKE_REQUIRED_QUIET``
-    If this variable evaluates to a boolean true value, all status messages
-    associated with the check will be suppressed.
-
-  The check is only performed once, with the result cached in the variable
-  named by ``resultVar``. Every subsequent CMake run will re-use this cached
-  value rather than performing the check again, even if the ``code`` changes.
-  In order to force the check to be re-evaluated, the variable named by
-  ``resultVar`` must be manually removed from the cache.
-
-#]=======================================================================]
-
-include_guard(GLOBAL)
-
-macro(CHECK_CXX_SOURCE_COMPILES SOURCE VAR NAME)
-  if(NOT DEFINED "${VAR}")
-    set(_FAIL_REGEX)
-    set(_key)
-    foreach(arg ${ARGN})
-      if("${arg}" MATCHES "^(FAIL_REGEX)$")
-        set(_key "${arg}")
-      elseif(_key)
-        list(APPEND _${_key} "${arg}")
-      else()
-        message(FATAL_ERROR "Unknown argument:\n  ${arg}\n")
-      endif()
-    endforeach()
-
-    set(MACRO_CHECK_FUNCTION_DEFINITIONS
-      "${CMAKE_REQUIRED_FLAGS}")
-    if(CMAKE_REQUIRED_LIBRARIES)
-      set(CHECK_CXX_SOURCE_COMPILES_ADD_LIBRARIES
-        LINK_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES})
-    else()
-      set(CHECK_CXX_SOURCE_COMPILES_ADD_LIBRARIES)
-    endif()
-    if(CMAKE_REQUIRED_INCLUDES)
-      set(CHECK_CXX_SOURCE_COMPILES_ADD_INCLUDES
-        "-DINCLUDE_DIRECTORIES:STRING=${CMAKE_REQUIRED_INCLUDES}")
-    else()
-      set(CHECK_CXX_SOURCE_COMPILES_ADD_INCLUDES)
-    endif()
-    file(WRITE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src.cxx"
-      "${SOURCE}\n")
-
-    if(NOT CMAKE_REQUIRED_QUIET)
-      message(STATUS "Testing ${NAME}")
-    endif()
-    try_compile(${VAR}
-      ${CMAKE_BINARY_DIR}
-      ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp/src.cxx
-      COMPILE_DEFINITIONS ${CMAKE_REQUIRED_DEFINITIONS}
-      ${CHECK_CXX_SOURCE_COMPILES_ADD_LIBRARIES}
-      CMAKE_FLAGS -DCOMPILE_DEFINITIONS:STRING=${MACRO_CHECK_FUNCTION_DEFINITIONS}
-      "${CHECK_CXX_SOURCE_COMPILES_ADD_INCLUDES}"
-      OUTPUT_VARIABLE OUTPUT)
-
-    foreach(_regex ${_FAIL_REGEX})
-      if("${OUTPUT}" MATCHES "${_regex}")
-        set(${VAR} 0)
-      endif()
-    endforeach()
-
-    if(${VAR})
-      set(${VAR} 1 CACHE INTERNAL "Test ${NAME}")
-      if(NOT CMAKE_REQUIRED_QUIET)
-        message(STATUS "Testing ${NAME} - Success")
-      endif()
-      file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
-        "Performing C++ SOURCE FILE Test ${NAME} succeeded with the following output:\n"
-        "${OUTPUT}\n"
-        "Source file was:\n${SOURCE}\n")
-    else()
-      if(NOT CMAKE_REQUIRED_QUIET)
-        message(STATUS "Testing ${NAME} - Failed")
-      endif()
-      set(${VAR} "" CACHE INTERNAL "Test ${NAME}")
-      file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log
-        "Performing C++ SOURCE FILE Test ${NAME} failed with the following output:\n"
-        "${OUTPUT}\n"
-        "Source file was:\n${SOURCE}\n")
-    endif()
-  endif()
-endmacro()
-
diff --git a/cmake/DetectSupportedStandards.cmake b/cmake/DetectSupportedStandards.cmake
new file mode 100644
index 000000000..5dceefdab
--- /dev/null
+++ b/cmake/DetectSupportedStandards.cmake
@@ -0,0 +1,47 @@
+# Detect the langauge standards supported by the current compilers.
+#
+# Usage: detect_supported_cxx_standards(<var_prefix> <lang> <standards>)
+#
+# - var_prefix: Used to name result variables,
+#   e.g. ${var_prefix}_${lang}_XX_SUPPORTED will be TRUE or FALSE. Defined for
+#   each XX in ${standards}.
+# - lang: The language to test: C, CXX, or CUDA.
+# - standards: List of any standard versions.
+#
+# Example: detect_supported_cxx_standards(PROJ CXX 11 14 17)
+#   - Sets the following variables in the parent scope to TRUE or FALSE:
+#     - PROJ_CXX_11_SUPPORTED
+#     - PROJ_CXX_14_SUPPORTED
+#     - PROJ_CXX_17_SUPPORTED
+#
+function(detect_supported_standards prefix lang)
+  string(TOLOWER "${lang}_std" feature_prefix)
+  foreach(standard IN LISTS ARGN)
+    set(var_name "${prefix}_${lang}_${standard}_SUPPORTED")
+    if ("${feature_prefix}_${standard}" IN_LIST CMAKE_${lang}_COMPILE_FEATURES)
+      set(${var_name} TRUE)
+    else()
+      set(${var_name} FALSE)
+    endif()
+
+
+    if (standard EQUAL 17 AND
+        (lang STREQUAL "CXX" OR lang STREQUAL "CUDA") AND
+        ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
+          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7) OR
+         (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND
+          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8)))
+      # Special cases:
+      # gcc < 7 and clang < 8 don't fully support C++17.
+      # They accept the flag and have partial support, but nvcc will refuse
+      # to enable it and falls back to the default dialect for the current
+      # CXX compiler version. This breaks our CI.
+      # CMake's COMPILE_FEATURES var reports that these compilers support C++17,
+      # but we can't rely on it, so manually disable the dialect in these cases.
+      set(${var_name} FALSE)
+    endif()
+
+    message(STATUS "Testing ${lang}${standard} Support: ${${var_name}}")
+    set(${var_name} ${${var_name}} PARENT_SCOPE)
+  endforeach()
+endfunction()
diff --git a/cmake/PrintCTestRunTimes.cmake b/cmake/PrintCTestRunTimes.cmake
new file mode 100644
index 000000000..bf23b9bb6
--- /dev/null
+++ b/cmake/PrintCTestRunTimes.cmake
@@ -0,0 +1,109 @@
+## This CMake script parses the output of ctest and prints a formatted list
+## of individual test runtimes, sorted longest first.
+##
+## ctest > ctest_log
+## cmake -DLOGFILE=ctest_log \
+##       -P PrintCTestRunTimes.cmake
+##
+################################################################################
+
+cmake_minimum_required(VERSION 3.15)
+
+# Prepend the string with "0" until the string length equals the specified width
+function(pad_string_with_zeros string_var width)
+  set(local_string "${${string_var}}")
+  string(LENGTH "${local_string}" size)
+  while(size LESS width)
+    string(PREPEND local_string "0")
+    string(LENGTH "${local_string}" size)
+  endwhile()
+  set(${string_var} "${local_string}" PARENT_SCOPE)
+endfunction()
+
+################################################################################
+
+if (NOT LOGFILE)
+  message(FATAL_ERROR "Missing -DLOGFILE=<ctest output> argument.")
+endif()
+
+# Check if logfile exists
+if (NOT EXISTS "${LOGFILE}")
+  message(FATAL_ERROR "LOGFILE does not exist ('${LOGFILE}').")
+endif()
+
+string(JOIN "" regex
+  "^[ ]*[0-9]+/[0-9]+[ ]+Test[ ]+#"
+  "([0-9]+)"                          # Test ID
+  ":[ ]+"
+  "(.+)"                              # Test Name
+  "[ ]+\\.+[ ]+"
+  "(.+[^ ])"                              # Result
+  "[ ]+"
+  "([0-9]+)"                          # Seconds
+  "\\.[0-9]+[ ]+sec[ ]*$"
+)
+
+message(DEBUG "Regex: ${regex}")
+
+# Read the logfile and generate a map / keylist
+set(keys)
+file(STRINGS "${LOGFILE}" lines)
+foreach(line ${lines})
+
+  # Parse each build time
+  string(REGEX MATCH "${regex}" _DUMMY "${line}")
+
+  if (CMAKE_MATCH_COUNT EQUAL 4)
+    set(test_id      "${CMAKE_MATCH_1}")
+    set(test_name    "${CMAKE_MATCH_2}")
+    set(test_result  "${CMAKE_MATCH_3}")
+    set(tmp          "${CMAKE_MATCH_4}") # floor(runtime_seconds)
+
+    # Compute human readable time
+    math(EXPR days         "${tmp} / (60 * 60 * 24)")
+    math(EXPR tmp          "${tmp} - (${days} * 60 * 60 * 24)")
+    math(EXPR hours        "${tmp} / (60 * 60)")
+    math(EXPR tmp          "${tmp} - (${hours} * 60 * 60)")
+    math(EXPR minutes      "${tmp} / (60)")
+    math(EXPR tmp          "${tmp} - (${minutes} * 60)")
+    math(EXPR seconds      "${tmp}")
+
+    # Format time components
+    pad_string_with_zeros(days 3)
+    pad_string_with_zeros(hours 2)
+    pad_string_with_zeros(minutes 2)
+    pad_string_with_zeros(seconds 2)
+
+    # Construct table entry
+    # Later values in the file for the same command overwrite earlier entries
+    string(MAKE_C_IDENTIFIER "${test_id}" key)
+    string(JOIN " | " ENTRY_${key}
+      "${days}d ${hours}h ${minutes}m ${seconds}s"
+      "${test_result}"
+      "${test_id}: ${test_name}"
+    )
+
+    # Record the key:
+    list(APPEND keys "${key}")
+  endif()
+endforeach()
+
+list(REMOVE_DUPLICATES keys)
+
+# Build the entry list:
+set(entries)
+foreach(key ${keys})
+  list(APPEND entries "${ENTRY_${key}}")
+endforeach()
+
+if (NOT entries)
+  message(FATAL_ERROR "LOGFILE contained no test times ('${LOGFILE}').")
+endif()
+
+# Sort in descending order:
+list(SORT entries ORDER DESCENDING)
+
+# Dump table:
+foreach(entry ${entries})
+  message(STATUS ${entry})
+endforeach()
diff --git a/cmake/PrintNinjaBuildTimes.cmake b/cmake/PrintNinjaBuildTimes.cmake
new file mode 100644
index 000000000..65d243d35
--- /dev/null
+++ b/cmake/PrintNinjaBuildTimes.cmake
@@ -0,0 +1,101 @@
+## This CMake script parses a .ninja_log file (LOGFILE) and prints a list of
+## build/link times, sorted longest first.
+##
+## cmake -DLOGFILE=<.ninja_log file> \
+##       -P PrintNinjaBuildTimes.cmake
+##
+## If LOGFILE is omitted, the current directory's .ninja_log file is used.
+################################################################################
+
+cmake_minimum_required(VERSION 3.15)
+
+# Prepend the string with "0" until the string length equals the specified width
+function(pad_string_with_zeros string_var width)
+  set(local_string "${${string_var}}")
+  string(LENGTH "${local_string}" size)
+  while(size LESS width)
+    string(PREPEND local_string "0")
+    string(LENGTH "${local_string}" size)
+  endwhile()
+  set(${string_var} "${local_string}" PARENT_SCOPE)
+endfunction()
+
+################################################################################
+
+if (NOT LOGFILE)
+  set(LOGFILE ".ninja_log")
+endif()
+
+# Check if logfile exists
+if (NOT EXISTS "${LOGFILE}")
+  message(FATAL_ERROR "LOGFILE does not exist ('${LOGFILE}').")
+endif()
+
+# Read the logfile and generate a map / keylist
+set(keys)
+file(STRINGS "${LOGFILE}" lines)
+foreach(line ${lines})
+
+  # Parse each build time
+  string(REGEX MATCH
+    "^([0-9]+)\t([0-9]+)\t[0-9]+\t([^\t]+)+\t[0-9a-fA-F]+$" _DUMMY "${line}")
+
+  if (CMAKE_MATCH_COUNT EQUAL 3)
+    set(start_ms ${CMAKE_MATCH_1})
+    set(end_ms ${CMAKE_MATCH_2})
+    set(command "${CMAKE_MATCH_3}")
+    math(EXPR runtime_ms "${end_ms} - ${start_ms}")
+
+    # Compute human readable time
+    math(EXPR days         "${runtime_ms} / (1000 * 60 * 60 * 24)")
+    math(EXPR runtime_ms   "${runtime_ms} - (${days} * 1000 * 60 * 60 * 24)")
+    math(EXPR hours        "${runtime_ms} / (1000 * 60 * 60)")
+    math(EXPR runtime_ms   "${runtime_ms} - (${hours} * 1000 * 60 * 60)")
+    math(EXPR minutes      "${runtime_ms} / (1000 * 60)")
+    math(EXPR runtime_ms   "${runtime_ms} - (${minutes} * 1000 * 60)")
+    math(EXPR seconds      "${runtime_ms} / 1000")
+    math(EXPR milliseconds "${runtime_ms} - (${seconds} * 1000)")
+
+    # Format time components
+    pad_string_with_zeros(days 3)
+    pad_string_with_zeros(hours 2)
+    pad_string_with_zeros(minutes 2)
+    pad_string_with_zeros(seconds 2)
+    pad_string_with_zeros(milliseconds 3)
+
+    # Construct table entry
+    # Later values in the file for the same command overwrite earlier entries
+    string(MAKE_C_IDENTIFIER "${command}" key)
+    set(ENTRY_${key}
+      "${days}d ${hours}h ${minutes}m ${seconds}s ${milliseconds}ms | ${command}"
+    )
+
+    # Record the key:
+    list(APPEND keys "${key}")
+  endif()
+endforeach()
+
+list(REMOVE_DUPLICATES keys)
+
+# Build the entry list:
+set(entries)
+foreach(key ${keys})
+  list(APPEND entries "${ENTRY_${key}}")
+endforeach()
+
+if (NOT entries)
+  message(FATAL_ERROR "LOGFILE contained no build entries ('${LOGFILE}').")
+endif()
+
+# Sort in descending order:
+list(SORT entries)
+list(REVERSE entries)
+
+# Dump table:
+message(STATUS "-----------------------+----------------------------")
+message(STATUS "Time                   | Command                    ")
+message(STATUS "-----------------------+----------------------------")
+
+foreach(entry ${entries})
+  message(STATUS ${entry})
+endforeach()
diff --git a/cmake/ThrustAddSubdir.cmake b/cmake/ThrustAddSubdir.cmake
new file mode 100644
index 000000000..d48aa1415
--- /dev/null
+++ b/cmake/ThrustAddSubdir.cmake
@@ -0,0 +1,6 @@
+find_package(Thrust REQUIRED CONFIG
+  NO_DEFAULT_PATH # Only check the explicit path in HINTS:
+  HINTS "${CMAKE_CURRENT_LIST_DIR}/.."
+  COMPONENTS ${THRUST_REQUIRED_SYSTEMS}
+  OPTIONAL_COMPONENTS ${THRUST_OPTIONAL_SYSTEMS}
+)
diff --git a/cmake/ThrustBuildCompilerTargets.cmake b/cmake/ThrustBuildCompilerTargets.cmake
new file mode 100644
index 000000000..aed0ec170
--- /dev/null
+++ b/cmake/ThrustBuildCompilerTargets.cmake
@@ -0,0 +1,191 @@
+#
+# This file defines the `thrust_build_compiler_targets()` function, which
+# creates the following interface targets:
+#
+# thrust.compiler_interface
+# - Interface target providing compiler-specific options needed to build
+#   Thrust's tests, examples, etc.
+#
+# thrust.compiler_interface_cppXX
+# - Interface targets providing compiler-specific options that should only be
+#   applied to certain dialects of C++. May not be defined for all dialects.
+#
+# thrust.promote_cudafe_warnings
+# - Interface target that adds warning promotion for NVCC cudafe invocations.
+# - Only exists to work around github issue #1174 on tbb.cuda configurations.
+# - May be combined with thrust.compiler_interface when #1174 is fully resolved.
+#
+# thrust.silence_unreachable_code_warnings
+# - Interface target that silences unreachable code warnings.
+# - Used to selectively disable such warnings in unit tests caused by
+#   unconditionally thrown exceptions.
+
+function(thrust_build_compiler_targets)
+  set(cxx_compile_definitions)
+  set(cxx_compile_options)
+
+  thrust_update_system_found_flags()
+
+  if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+    append_option_if_available("/W4" cxx_compile_options)
+
+    # Treat all warnings as errors. This is only supported on Release builds,
+    # as `nv_exec_check_disable` doesn't seem to work with MSVC debug iterators
+    # and spurious warnings are emitted.
+    # See NVIDIA/thrust#1273, NVBug 3129879.
+    if (CMAKE_BUILD_TYPE STREQUAL "Release")
+      append_option_if_available("/WX" cxx_compile_options)
+    endif()
+
+    # Suppress overly-pedantic/unavoidable warnings brought in with /W4:
+    # C4324: structure was padded due to alignment specifier
+    append_option_if_available("/wd4324" cxx_compile_options)
+    # C4505: unreferenced local function has been removed
+    # The CUDA `host_runtime.h` header emits this for
+    # `__cudaUnregisterBinaryUtil`.
+    append_option_if_available("/wd4505" cxx_compile_options)
+    # C4706: assignment within conditional expression
+    # MSVC doesn't provide an opt-out for this warning when the assignment is
+    # intentional. Clang will warn for these, but suppresses the warning when
+    # double-parentheses are used around the assignment. We'll let Clang catch
+    # unintentional assignments and suppress all such warnings on MSVC.
+    append_option_if_available("/wd4706" cxx_compile_options)
+
+    # Disabled loss-of-data conversion warnings.
+    # TODO Re-enable.
+    append_option_if_available("/wd4244" cxx_compile_options)
+
+    # Disable warning about applying unary operator- to unsigned type.
+    # TODO Re-enable.
+    append_option_if_available("/wd4146" cxx_compile_options)
+
+    # MSVC STL assumes that `allocator_traits`'s allocator will use raw pointers,
+    # and the `__DECLSPEC_ALLOCATOR` macro causes issues with thrust's universal
+    # allocators:
+    #   warning C4494: 'std::allocator_traits<_Alloc>::allocate' :
+    #      Ignoring __declspec(allocator) because the function return type is not
+    #      a pointer or reference
+    # See https://github.com/microsoft/STL/issues/696
+    append_option_if_available("/wd4494" cxx_compile_options)
+
+    # Some of the async tests require /bigobj to fit all their sections into the
+    # object files:
+    append_option_if_available("/bigobj" cxx_compile_options)
+
+    # "Oh right, this is Visual Studio."
+    list(APPEND cxx_compile_definitions "NOMINMAX")
+  else()
+    append_option_if_available("-Werror" cxx_compile_options)
+    append_option_if_available("-Wall" cxx_compile_options)
+    append_option_if_available("-Wextra" cxx_compile_options)
+    append_option_if_available("-Winit-self" cxx_compile_options)
+    append_option_if_available("-Woverloaded-virtual" cxx_compile_options)
+    append_option_if_available("-Wcast-qual" cxx_compile_options)
+    append_option_if_available("-Wpointer-arith" cxx_compile_options)
+    append_option_if_available("-Wunused-local-typedef" cxx_compile_options)
+    append_option_if_available("-Wvla" cxx_compile_options)
+
+    # Disable GNU extensions (flag is clang only)
+    append_option_if_available("-Wgnu" cxx_compile_options)
+    # Calling a variadic macro with zero args is a GNU extension until C++20,
+    # but the THRUST_PP_ARITY macro is used with zero args. Need to see if this
+    # is a real problem worth fixing.
+    append_option_if_available("-Wno-gnu-zero-variadic-macro-arguments" cxx_compile_options)
+
+    # This complains about functions in CUDA system headers when used with nvcc.
+    append_option_if_available("-Wno-unused-function" cxx_compile_options)
+  endif()
+
+  if ("GNU" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3)
+      # GCC 7.3 complains about name mangling changes due to `noexcept`
+      # becoming part of the type system; we don't care.
+      append_option_if_available("-Wno-noexcept-type" cxx_compile_options)
+    endif()
+  endif()
+
+  if ("Intel" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+    # Disable warning that inlining is inhibited by compiler thresholds.
+    append_option_if_available("-diag-disable=11074" cxx_compile_options)
+    append_option_if_available("-diag-disable=11076" cxx_compile_options)
+  endif()
+
+  if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+    # Today:
+    # * NVCC accepts CUDA C++ in .cu files but not .cpp files.
+    # * NVC++ accepts CUDA C++ in .cpp files but not .cu files.
+    # TODO: This won't be necessary in the future.
+    list(APPEND cxx_compile_options -cppsuffix=cu)
+  endif()
+
+  add_library(thrust.compiler_interface INTERFACE)
+
+  foreach (cxx_option IN LISTS cxx_compile_options)
+    target_compile_options(thrust.compiler_interface INTERFACE
+      $<$<COMPILE_LANGUAGE:CXX>:${cxx_option}>
+      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVCXX>>:${cxx_option}>
+      # Only use -Xcompiler with NVCC, not NVC++.
+      #
+      # CMake can't split genexs, so this can't be formatted better :(
+      # This is:
+      # if (using CUDA and CUDA_COMPILER is NVCC) add -Xcompiler=opt:
+      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcompiler=${cxx_option}>
+    )
+  endforeach()
+
+  foreach (cxx_definition IN LISTS cxx_compile_definitions)
+    # Add these for both CUDA and CXX targets:
+    target_compile_definitions(thrust.compiler_interface INTERFACE
+      ${cxx_definition}
+    )
+  endforeach()
+
+  # Display warning numbers from nvcc cudafe errors:
+  target_compile_options(thrust.compiler_interface INTERFACE
+    # If using CUDA w/ NVCC...
+    $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcudafe=--display_error_number>
+  )
+
+  # Tell NVCC to be quiet about deprecated GPU targets:
+  target_compile_options(thrust.compiler_interface INTERFACE
+    # If using CUDA w/ NVCC...
+    $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Wno-deprecated-gpu-targets>
+  )
+
+  # This is kept separate for Github issue #1174.
+  add_library(thrust.promote_cudafe_warnings INTERFACE)
+  target_compile_options(thrust.promote_cudafe_warnings INTERFACE
+    $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcudafe=--promote_warnings>
+  )
+
+  # Some of our unit tests unconditionally throw exceptions, and compilers will
+  # detect that the following instructions are unreachable. This is intentional
+  # and unavoidable in these cases. This target can be used to silence
+  # unreachable code warnings.
+  add_library(thrust.silence_unreachable_code_warnings INTERFACE)
+  if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+    target_compile_options(thrust.silence_unreachable_code_warnings INTERFACE
+      $<$<COMPILE_LANGUAGE:CXX>:/wd4702>
+      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcompiler=/wd4702>
+    )
+  endif()
+
+  # These targets are used for dialect-specific options:
+  add_library(thrust.compiler_interface_cpp11 INTERFACE)
+  add_library(thrust.compiler_interface_cpp14 INTERFACE)
+
+  if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+    # C4127: conditional expression is constant
+    # Disable this MSVC warning for C++11/C++14. In C++17, we can use
+    # THRUST_IF_CONSTEXPR to address these warnings.
+    target_compile_options(thrust.compiler_interface_cpp11 INTERFACE
+      $<$<COMPILE_LANGUAGE:CXX>:/wd4127>
+      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcompiler=/wd4127>
+    )
+    target_compile_options(thrust.compiler_interface_cpp14 INTERFACE
+      $<$<COMPILE_LANGUAGE:CXX>:/wd4127>
+      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcompiler=/wd4127>
+    )
+  endif()
+
+endfunction()
diff --git a/cmake/ThrustBuildTargetList.cmake b/cmake/ThrustBuildTargetList.cmake
new file mode 100644
index 000000000..f4adaf546
--- /dev/null
+++ b/cmake/ThrustBuildTargetList.cmake
@@ -0,0 +1,339 @@
+# This file provides utilities for building and working with thrust
+# configuration targets.
+#
+# THRUST_TARGETS
+#  - Built by the calling the `thrust_build_target_list()` function.
+#  - Each item is the name of a thrust interface target that is configured for a
+#    certain combination of host/device/dialect.
+#
+# thrust_build_target_list()
+# - Creates the THRUST_TARGETS list.
+#
+# The following functions can be used to test/set metadata on a thrust target:
+#
+# thrust_get_target_property(<prop_var> <target_name> <prop>)
+#   - Checks the ${prop} target property on thrust target ${target_name}
+#     and sets the ${prop_var} variable in the caller's scope.
+#   - <prop_var> is any valid cmake identifier.
+#   - <target_name> is the name of a thrust target.
+#   - <prop> is one of the following:
+#     - HOST: The host system. Valid values: CPP, OMP, TBB.
+#     - DEVICE: The device system. Valid values: CUDA, CPP, OMP, TBB.
+#     - DIALECT: The C++ dialect. Valid values: 11, 14, 17, 20.
+#     - PREFIX: A unique prefix that should be used to name all
+#       targets/tests/examples that use this configuration.
+#
+# thrust_get_target_properties(<target_name>)
+#   - Defines ${target_name}_${prop} in the caller's scope, for `prop` in:
+#     HOST, DEVICE, DIALECT, PREFIX. See above for details.
+#
+# thrust_clone_target_properties(<dst_target> <src_target>)
+#   - Set the HOST, DEVICE, DIALECT, PREFIX metadata on ${dst_target} to match
+#     ${src_target}. See above for details.
+#   - This *MUST* be called on any targets that link to another thrust target
+#     to ensure that dialect information is updated correctly, e.g.
+#     `thrust_clone_target_properties(${my_thrust_test} ${some_thrust_target})`
+
+define_property(TARGET PROPERTY _THRUST_HOST
+  BRIEF_DOCS "A target's host system: CPP, TBB, or OMP."
+  FULL_DOCS "A target's host system: CPP, TBB, or OMP."
+)
+define_property(TARGET PROPERTY _THRUST_DEVICE
+  BRIEF_DOCS "A target's device system: CUDA, CPP, TBB, or OMP."
+  FULL_DOCS "A target's device system: CUDA, CPP, TBB, or OMP."
+)
+define_property(TARGET PROPERTY _THRUST_DIALECT
+  BRIEF_DOCS "A target's C++ dialect: 11, 14, or 17."
+  FULL_DOCS "A target's C++ dialect: 11, 14, or 17."
+)
+define_property(TARGET PROPERTY _THRUST_PREFIX
+  BRIEF_DOCS "A prefix describing the config, eg. 'thrust.cpp.cuda.cpp14'."
+  FULL_DOCS "A prefix describing the config, eg. 'thrust.cpp.cuda.cpp14'."
+)
+
+function(thrust_set_target_properties target_name host device dialect prefix)
+  set_target_properties(${target_name}
+    PROPERTIES
+      _THRUST_HOST ${host}
+      _THRUST_DEVICE ${device}
+      _THRUST_DIALECT ${dialect}
+      _THRUST_PREFIX ${prefix}
+  )
+
+  get_property(langs GLOBAL PROPERTY ENABLED_LANGUAGES)
+  set(standard_features)
+  if (CUDA IN_LIST langs)
+    list(APPEND standard_features cuda_std_${dialect})
+  endif()
+  if (CXX IN_LIST langs)
+    list(APPEND standard_features cxx_std_${dialect})
+  endif()
+
+  get_target_property(type ${target_name} TYPE)
+  if (${type} STREQUAL "INTERFACE_LIBRARY")
+    target_compile_features(${target_name} INTERFACE
+      ${standard_features}
+    )
+  else()
+    target_compile_features(${target_name} PUBLIC
+      ${standard_features}
+    )
+    set_target_properties(${target_name}
+      PROPERTIES
+        CXX_STANDARD ${dialect}
+        CUDA_STANDARD ${dialect}
+        # Must manually request that the standards above are actually respected
+        # or else CMake will silently fail to configure the targets correctly...
+        # Note that this doesn't actually work as of CMake 3.16:
+        # https://gitlab.kitware.com/cmake/cmake/-/issues/20953
+        # We'll leave these properties enabled in hopes that they will someday
+        # work.
+        CXX_STANDARD_REQUIRED ON
+        CUDA_STANDARD_REQUIRED ON
+        ARCHIVE_OUTPUT_DIRECTORY "${THRUST_LIBRARY_OUTPUT_DIR}"
+        LIBRARY_OUTPUT_DIRECTORY "${THRUST_LIBRARY_OUTPUT_DIR}"
+        RUNTIME_OUTPUT_DIRECTORY "${THRUST_EXECUTABLE_OUTPUT_DIR}"
+    )
+
+    # CMake still emits errors about empty CUDA_ARCHITECTURES when CMP0104
+    # is set to OLD. This suppresses the errors for good.
+    if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
+      set_target_properties(${target_name}
+        PROPERTIES
+          CUDA_ARCHITECTURES OFF
+      )
+    endif()
+
+    if ("CUDA" STREQUAL "${device}" AND
+        "NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+      set_target_properties(${target_name} PROPERTIES
+        CUDA_RESOLVE_DEVICE_SYMBOLS OFF
+      )
+    endif()
+  endif()
+endfunction()
+
+# Get a thrust property from a target and store it in var_name
+# thrust_get_target_property(<var_name> <target_name> [HOST|DEVICE|DIALECT|PREFIX]
+macro(thrust_get_target_property prop_var target_name prop)
+  get_property(${prop_var} TARGET ${target_name} PROPERTY _THRUST_${prop})
+endmacro()
+
+# Defines the following string variables in the caller's scope:
+# - ${target_name}_HOST
+# - ${target_name}_DEVICE
+# - ${target_name}_DIALECT
+# - ${target_name}_PREFIX
+macro(thrust_get_target_properties target_name)
+  thrust_get_target_property(${target_name}_HOST ${target_name} HOST)
+  thrust_get_target_property(${target_name}_DEVICE ${target_name} DEVICE)
+  thrust_get_target_property(${target_name}_DIALECT ${target_name} DIALECT)
+  thrust_get_target_property(${target_name}_PREFIX ${target_name} PREFIX)
+endmacro()
+
+# Set one target's THRUST_* properties to match another target
+function(thrust_clone_target_properties dst_target src_target)
+  thrust_get_target_properties(${src_target})
+  thrust_set_target_properties(${dst_target}
+    ${${src_target}_HOST}
+    ${${src_target}_DEVICE}
+    ${${src_target}_DIALECT}
+    ${${src_target}_PREFIX}
+  )
+endfunction()
+
+# Set ${var_name} to TRUE or FALSE in the caller's scope
+function(_thrust_is_config_valid var_name host device dialect)
+  if (THRUST_MULTICONFIG_ENABLE_SYSTEM_${host} AND
+      THRUST_MULTICONFIG_ENABLE_SYSTEM_${device} AND
+      THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${dialect} AND
+      "${host}_${device}" IN_LIST THRUST_MULTICONFIG_WORKLOAD_${THRUST_MULTICONFIG_WORKLOAD}_CONFIGS)
+    set(${var_name} TRUE PARENT_SCOPE)
+  else()
+    set(${var_name} FALSE PARENT_SCOPE)
+  endif()
+endfunction()
+
+function(_thrust_init_target_list)
+  set(THRUST_TARGETS "" CACHE INTERNAL "" FORCE)
+endfunction()
+
+function(_thrust_add_target_to_target_list target_name host device dialect prefix)
+  thrust_set_target_properties(${target_name} ${host} ${device} ${dialect} ${prefix})
+
+  target_link_libraries(${target_name} INTERFACE
+    thrust.compiler_interface
+  )
+
+  # dialect-specific interface:
+  if (TARGET thrust.compiler_interface_cpp${dialect})
+    target_link_libraries(${target_name} INTERFACE
+      thrust.compiler_interface_cpp${dialect}
+    )
+  endif()
+
+  # Workaround Github issue #1174. cudafe promote TBB header warnings to
+  # errors, even when they're -isystem includes.
+  if ((NOT host STREQUAL "TBB") OR (NOT device STREQUAL "CUDA"))
+    target_link_libraries(${target_name} INTERFACE
+      thrust.promote_cudafe_warnings
+    )
+  endif()
+
+  set(THRUST_TARGETS ${THRUST_TARGETS} ${target_name} CACHE INTERNAL "" FORCE)
+
+  set(label "${host}.${device}.cpp${dialect}")
+  string(TOLOWER "${label}" label)
+  message(STATUS "Enabling Thrust configuration: ${label}")
+endfunction()
+
+function(_thrust_build_target_list_multiconfig)
+  # Detect supported dialects if requested -- this must happen after CUDA is
+  # enabled, if it's going to be enabled.
+  if (THRUST_MULTICONFIG_ENABLE_DIALECT_ALL OR
+      THRUST_MULTICONFIG_ENABLE_DIALECT_LATEST)
+    message(STATUS "Testing for supported language standards...")
+    include("${CMAKE_CURRENT_FUNCTION_LIST_DIR}/DetectSupportedStandards.cmake")
+    detect_supported_standards(THRUST CXX ${THRUST_CPP_DIALECT_OPTIONS})
+    if (THRUST_CUDA_FOUND)
+      detect_supported_standards(THRUST CUDA ${THRUST_CPP_DIALECT_OPTIONS})
+    endif()
+
+    # Take the union of supported standards in CXX and CUDA:
+    set(supported_dialects)
+    set(latest_dialect 11)
+    foreach(standard IN LISTS THRUST_CPP_DIALECT_OPTIONS)
+      if ((THRUST_CXX_${standard}_SUPPORTED) AND
+          ((NOT THRUST_CUDA_FOUND) OR THRUST_CUDA_${standard}_SUPPORTED))
+
+        # MSVC silently promotes C++11 to C++14 -- skip it:
+        if ((${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC) AND (standard EQUAL 11))
+          continue()
+        endif()
+
+        list(APPEND supported_dialects ${standard})
+        if (latest_dialect LESS standard)
+          set(latest_dialect ${standard})
+        endif()
+      endif()
+    endforeach()
+
+    if (THRUST_MULTICONFIG_ENABLE_DIALECT_ALL)
+      foreach(standard IN LISTS THRUST_CPP_DIALECT_OPTIONS)
+        if (standard IN_LIST supported_dialects)
+          set(THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${standard} ON CACHE BOOL
+              "Generate C++${dialect} build configurations." FORCE
+          )
+        else()
+          set(THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${standard} OFF CACHE BOOL
+            "Generate C++${dialect} build configurations." FORCE
+            )
+        endif()
+      endforeach()
+    elseif(THRUST_MULTICONFIG_ENABLE_DIALECT_LATEST)
+      foreach(standard IN LISTS THRUST_CPP_DIALECT_OPTIONS)
+        if (standard EQUAL latest_dialect)
+          set(THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${standard} ON CACHE BOOL
+            "Generate C++${dialect} build configurations." FORCE
+            )
+        else()
+          set(THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${standard} OFF CACHE BOOL
+            "Generate C++${dialect} build configurations." FORCE
+            )
+        endif()
+      endforeach()
+    endif()
+  endif()
+
+  # Supported versions of MSVC do not distinguish between C++11 and C++14.
+  # Warn the user that they may be generating a ton of redundant targets if
+  # they explicitly requested this configuration.
+  if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND
+      THRUST_MULTICONFIG_ENABLE_DIALECT_CPP11)
+    message(WARNING
+      "Supported versions of MSVC (2017+) do not distinguish between C++11 "
+      "and C++14. The requested C++11 targets may be redundant."
+    )
+  endif()
+
+  # Build THRUST_TARGETS
+  foreach(host IN LISTS THRUST_HOST_SYSTEM_OPTIONS)
+    foreach(device IN LISTS THRUST_DEVICE_SYSTEM_OPTIONS)
+      foreach(dialect IN LISTS THRUST_CPP_DIALECT_OPTIONS)
+        _thrust_is_config_valid(config_valid ${host} ${device} ${dialect})
+        if (config_valid)
+          set(prefix "thrust.${host}.${device}.cpp${dialect}")
+          string(TOLOWER "${prefix}" prefix)
+
+          # Configure a thrust interface target for this host/device
+          set(target_name "${prefix}")
+          thrust_create_target(${target_name}
+            HOST ${host}
+            DEVICE ${device}
+            ${THRUST_TARGET_FLAGS}
+          )
+
+          # Set configuration metadata for this thrust interface target:
+          _thrust_add_target_to_target_list(${target_name}
+            ${host} ${device} ${dialect} ${prefix}
+          )
+
+          # Create a meta target for all targets in this configuration:
+          add_custom_target(${prefix}.all)
+          add_dependencies(thrust.all ${prefix}.all)
+        endif()
+      endforeach() # dialects
+    endforeach() # devices
+  endforeach() # hosts
+
+  list(LENGTH THRUST_TARGETS count)
+  message(STATUS "${count} unique thrust.host.device.dialect configurations generated")
+endfunction()
+
+function(_thrust_build_target_list_singleconfig)
+  set(host ${THRUST_HOST_SYSTEM})
+  set(device ${THRUST_DEVICE_SYSTEM})
+  set(dialect ${THRUST_CPP_DIALECT})
+  set(prefix "thrust") # single config
+
+  _thrust_add_target_to_target_list(thrust ${host} ${device} ${dialect} ${prefix})
+endfunction()
+
+# Build a ${THRUST_TARGETS} list containing target names for all
+# requested configurations
+function(thrust_build_target_list)
+  # Clear the list of targets:
+  _thrust_init_target_list()
+
+  # Generic config flags:
+  set(THRUST_TARGET_FLAGS)
+  macro(add_flag_option flag docstring default)
+    set(opt "THRUST_${flag}")
+    option(${opt} "${docstring}" "${default}")
+    mark_as_advanced(${opt})
+    if (${${opt}})
+      list(APPEND THRUST_TARGET_FLAGS ${flag})
+    endif()
+  endmacro()
+  add_flag_option(IGNORE_DEPRECATED_CPP_DIALECT "Don't warn about any deprecated C++ standards and compilers." OFF)
+  add_flag_option(IGNORE_DEPRECATED_CPP_11 "Don't warn about deprecated C++11." OFF)
+  add_flag_option(IGNORE_DEPRECATED_COMPILER "Don't warn about deprecated compilers." OFF)
+  add_flag_option(IGNORE_CUB_VERSION_CHECK "Don't warn about mismatched CUB versions." OFF)
+  add_flag_option(IGNORE_DEPRECATED_API "Don't warn about deprecated Thrust or CUB APIs." OFF)
+
+  # Top level meta-target. Makes it easier to just build thrust targets when
+  # building both CUB and Thrust. Add all project files here so IDEs will be
+  # aware of them. This will not generate build rules.
+  file(GLOB_RECURSE all_sources
+    RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
+    "${Thrust_SOURCE_DIR}/thrust/*.h"
+    "${Thrust_SOURCE_DIR}/thrust/*.inl"
+  )
+  add_custom_target(thrust.all SOURCES ${all_sources})
+
+  if (THRUST_ENABLE_MULTICONFIG)
+    _thrust_build_target_list_multiconfig()
+  else()
+    _thrust_build_target_list_singleconfig()
+  endif()
+endfunction()
diff --git a/cmake/ThrustCompilerHacks.cmake b/cmake/ThrustCompilerHacks.cmake
new file mode 100644
index 000000000..5f7b0d98e
--- /dev/null
+++ b/cmake/ThrustCompilerHacks.cmake
@@ -0,0 +1,110 @@
+# Set up compiler paths and apply temporary hacks to support NVC++.
+# This file must be included before enabling any languages.
+
+# Temporary hacks to make NVC++ work; this requires you to define
+# `CMAKE_CUDA_COMPILER_ID=NVCXX and `CMAKE_CUDA_COMPILER_FORCED=ON`.
+if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  # If using NVC++, don't set CXX compiler
+  if (NOT "${CMAKE_CXX_COMPILER}" STREQUAL "")
+    unset(CMAKE_CXX_COMPILER CACHE)
+    message(FATAL_ERROR "You are using NVC++ as your CUDA C++ compiler, but have"
+      " specified a different ISO C++ compiler; NVC++ acts as both, so please"
+      " unset the CMAKE_CXX_COMPILER variable."
+    )
+  endif()
+
+  # We don't set CMAKE_CUDA_HOST_COMPILER for NVC++; if we do, CMake tries to
+  # pass `-ccbin ${CMAKE_CUDA_HOST_COMPILER}` to NVC++, which it doesn't
+  # understand.
+  if (NOT "${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "")
+    unset(CMAKE_CUDA_HOST_COMPILER CACHE)
+    message(FATAL_ERROR "You are using NVC++ as your CUDA C++ compiler, but have"
+      " specified a different host ISO C++ compiler; NVC++ acts as both, so"
+      " please unset the CMAKE_CUDA_HOST_COMPILER variable."
+    )
+  endif()
+
+  set(CMAKE_CXX_COMPILER "${CMAKE_CUDA_COMPILER}")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -stdpar")
+  set(CMAKE_CUDA_HOST_LINK_LAUNCHER "${CMAKE_CUDA_COMPILER}")
+  set(CMAKE_CUDA_LINK_EXECUTABLE
+    "<CMAKE_CUDA_HOST_LINK_LAUNCHER> <FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
+
+  # Setup CMAKE_CXX_LIBRARY_ARCHITECTURE on Debian/Ubuntu so that find_package
+  # works properly.
+  if (EXISTS /etc/debian_version)
+    if (NOT CMAKE_CXX_LIBRARY_ARCHITECTURE)
+      file(GLOB files_in_lib RELATIVE /lib /lib/*-linux-gnu* )
+      foreach (file ${files_in_lib})
+        if ("${file}" MATCHES "${CMAKE_LIBRARY_ARCHITECTURE_REGEX}")
+          set(CMAKE_CXX_LIBRARY_ARCHITECTURE ${file})
+          break()
+        endif()
+      endforeach()
+    endif()
+    if (NOT CMAKE_LIBRARY_ARCHITECTURE)
+      set(CMAKE_LIBRARY_ARCHITECTURE ${CMAKE_CXX_LIBRARY_ARCHITECTURE})
+    endif()
+  endif()
+endif()
+
+# We don't set CMAKE_CUDA_HOST_COMPILER for NVC++; if we do, CMake tries to
+# pass `-ccbin ${CMAKE_CUDA_HOST_COMPILER}` to NVC++, which it doesn't
+# understand.
+if ((NOT "NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}"))
+  if (NOT ("${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "" OR
+    "${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "${CMAKE_CXX_COMPILER}"))
+    set(tmp "${CMAKE_CUDA_HOST_COMPILER}")
+    unset(CMAKE_CUDA_HOST_COMPILER CACHE)
+    message(FATAL_ERROR
+      "For convenience, Thrust's test harness uses CMAKE_CXX_COMPILER for the "
+      "CUDA host compiler. Refusing to overwrite specified "
+      "CMAKE_CUDA_HOST_COMPILER -- please reconfigure without setting this "
+      "variable. Currently:\n"
+      "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}\n"
+      "CMAKE_CUDA_HOST_COMPILER=${tmp}"
+    )
+  endif ()
+  set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}")
+endif ()
+
+# Temporary hacks to make NVC++ work; this requires you to define
+# `CMAKE_CUDA_COMPILER_ID=NVCXX and `CMAKE_CUDA_COMPILER_FORCED=ON`.
+if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  # Need 3.17 for the properties used below.
+  cmake_minimum_required(VERSION 3.17)
+
+  set(CMAKE_CUDA_STANDARD_DEFAULT 03)
+
+  set(CMAKE_CUDA03_STANDARD_COMPILE_OPTION "-std=c++03")
+  set(CMAKE_CUDA03_EXTENSION_COMPILE_OPTION "-std=c++03")
+  set(CMAKE_CUDA03_STANDARD__HAS_FULL_SUPPORT TRUE)
+  set_property(GLOBAL PROPERTY CMAKE_CUDA03_KNOWN_FEATURES)
+
+  set(CMAKE_CUDA11_STANDARD_COMPILE_OPTION "-std=c++11")
+  set(CMAKE_CUDA11_EXTENSION_COMPILE_OPTION "-std=c++11")
+  set(CMAKE_CUDA11_STANDARD__HAS_FULL_SUPPORT TRUE)
+  set_property(GLOBAL PROPERTY CMAKE_CUDA11_KNOWN_FEATURES)
+
+  set(CMAKE_CUDA14_STANDARD_COMPILE_OPTION "-std=c++14")
+  set(CMAKE_CUDA14_EXTENSION_COMPILE_OPTION "-std=c++14")
+  set(CMAKE_CUDA14_STANDARD__HAS_FULL_SUPPORT TRUE)
+  set_property(GLOBAL PROPERTY CMAKE_CUDA14_KNOWN_FEATURES)
+
+  set(CMAKE_CUDA17_STANDARD_COMPILE_OPTION "-std=c++17")
+  set(CMAKE_CUDA17_EXTENSION_COMPILE_OPTION "-std=c++17")
+  set(CMAKE_CUDA17_STANDARD__HAS_FULL_SUPPORT TRUE)
+  set_property(GLOBAL PROPERTY CMAKE_CUDA17_KNOWN_FEATURES)
+
+  include(Internal/FeatureTesting)
+  include(Compiler/CMakeCommonCompilerMacros)
+  cmake_record_cuda_compile_features()
+
+  set(CMAKE_CUDA_COMPILE_FEATURES
+    ${CMAKE_CUDA03_COMPILE_FEATURES}
+    ${CMAKE_CUDA11_COMPILE_FEATURES}
+    ${CMAKE_CUDA14_COMPILE_FEATURES}
+    ${CMAKE_CUDA17_COMPILE_FEATURES}
+    ${CMAKE_CUDA20_COMPILE_FEATURES}
+  )
+endif()
diff --git a/cmake/ThrustCudaConfig.cmake b/cmake/ThrustCudaConfig.cmake
new file mode 100644
index 000000000..a585c7910
--- /dev/null
+++ b/cmake/ThrustCudaConfig.cmake
@@ -0,0 +1,200 @@
+enable_language(CUDA)
+
+set(THRUST_KNOWN_COMPUTE_ARCHS 50 52 53 60 61 62 70 72 75 80 86)
+
+if (NVIDIA STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER 11.7)
+    list(APPEND THRUST_KNOWN_COMPUTE_ARCHS 90)
+  endif()
+  if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.0)
+    list(APPEND THRUST_KNOWN_COMPUTE_ARCHS 35 37)
+  endif()
+else()
+  list(APPEND THRUST_KNOWN_COMPUTE_ARCHS 35 37 90)
+endif()
+
+# Split CUDA_FLAGS into 3 parts:
+#
+# THRUST_CUDA_FLAGS_BASE: Common CUDA flags for all targets.
+# THRUST_CUDA_FLAGS_RDC: Additional CUDA flags for targets compiled with RDC.
+# THRUST_CUDA_FLAGS_NO_RDC: Additional CUDA flags for targets compiled without RDC.
+#
+# This is necessary because CUDA SMs 5.3, 6.2, and 7.2 do not support RDC, but
+# we want to always build some targets (e.g. testing/cuda/*) with RDC.
+# We work around this by building the "always RDC" targets without support for
+# those SMs. This requires two sets of CUDA_FLAGS.
+#
+# Enabling any of those SMs along with the ENABLE_RDC options will result in a
+# configuration error.
+#
+# Because of how CMake handles the CMAKE_CUDA_FLAGS variables, every target
+# generated in a given directory will use the same value for CMAKE_CUDA_FLAGS,
+# which is determined at the end of the directory's scope. This means caution
+# should be used when trying to build different targets with different flags,
+# since they might not behave as expected. This will improve with CMake 3.18,
+# which add the DEVICE_LINK genex, fixing the issue with using per-target
+# CUDA_FLAGS: https://gitlab.kitware.com/cmake/cmake/-/issues/18265
+set(THRUST_CUDA_FLAGS_BASE "${CMAKE_CUDA_FLAGS}")
+set(THRUST_CUDA_FLAGS_RDC)
+set(THRUST_CUDA_FLAGS_NO_RDC)
+
+# Archs that don't support RDC:
+set(no_rdc_archs 53 62 72)
+
+# Find the highest arch:
+list(SORT THRUST_KNOWN_COMPUTE_ARCHS)
+list(LENGTH THRUST_KNOWN_COMPUTE_ARCHS max_idx)
+math(EXPR max_idx "${max_idx} - 1")
+list(GET THRUST_KNOWN_COMPUTE_ARCHS ${max_idx} highest_arch)
+
+option(THRUST_AUTO_DETECT_COMPUTE_ARCHS
+  "If ON, compute architectures for all GPUs in the current system are enabled and all other compute architectures are disabled."
+  OFF
+)
+
+if (THRUST_AUTO_DETECT_COMPUTE_ARCHS)
+  if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+    message(STATUS "Thrust: Using NVC++ builtin automatic compute architecture detection.")
+  else()
+    set(detect_compute_archs_source ${Thrust_SOURCE_DIR}/cmake/detect_compute_archs.cu)
+    set(detect_compute_archs_exe ${PROJECT_BINARY_DIR}/detect_compute_archs)
+    set(detect_compute_archs_error_log ${PROJECT_BINARY_DIR}/detect_compute_archs.stderr.log)
+    execute_process(
+      COMMAND ${CMAKE_CUDA_COMPILER}
+        -std=c++11
+        -o ${detect_compute_archs_exe}
+        --run
+        ${detect_compute_archs_source}
+      OUTPUT_VARIABLE detected_archs
+      OUTPUT_STRIP_TRAILING_WHITESPACE
+      ERROR_FILE ${detect_compute_archs_error_log})
+    if ("NONE" STREQUAL "${detected_archs}")
+      set(detected_message " none")
+    else()
+      foreach (arch IN LISTS detected_archs)
+        string(APPEND detected_message " sm_${arch}")
+      endforeach()
+    endif()
+    message(STATUS "Thrust: Automatically detected compute architectures:${detected_message}")
+  endif()
+endif()
+
+set(option_init OFF)
+if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  set(option_init ON)
+endif()
+option(THRUST_DISABLE_ARCH_BY_DEFAULT
+  "If ON, then all compute architectures are disabled on the initial CMake run."
+  ${option_init}
+)
+
+set(option_init ON)
+if (THRUST_DISABLE_ARCH_BY_DEFAULT OR THRUST_AUTO_DETECT_COMPUTE_ARCHS)
+  set(option_init OFF)
+endif()
+
+set(num_archs_enabled 0)
+foreach (arch IN LISTS THRUST_KNOWN_COMPUTE_ARCHS)
+  set(this_option_init ${option_init})
+
+  if (${arch} IN_LIST detected_archs)
+    set(this_option_init ON)
+  endif()
+
+  option(THRUST_ENABLE_COMPUTE_${arch}
+    "Enable code generation for tests for sm_${arch}"
+    ${this_option_init}
+  )
+
+  if (NOT THRUST_ENABLE_COMPUTE_${arch})
+    continue()
+  endif()
+
+  math(EXPR num_archs_enabled "${num_archs_enabled} + 1")
+
+  if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+    if (NOT ${num_archs_enabled} EQUAL 1)
+      message(FATAL_ERROR
+        "NVCXX does not support compilation for multiple device architectures "
+        "at once."
+      )
+    endif()
+    set(arch_flag "-gpu=cc${arch}")
+  elseif ("Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+    set(arch_flag "--cuda-gpu-arch=sm_${arch}")
+  else()
+    set(arch_flag "-gencode arch=compute_${arch},code=sm_${arch}")
+  endif()
+
+  string(APPEND compute_message " sm_${arch}")
+  string(APPEND THRUST_CUDA_FLAGS_NO_RDC " ${arch_flag}")
+  if (NOT arch IN_LIST no_rdc_archs)
+    string(APPEND THRUST_CUDA_FLAGS_RDC " ${arch_flag}")
+  endif()
+endforeach()
+
+if (NOT "NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  option(THRUST_ENABLE_COMPUTE_FUTURE
+    "Enable code generation for tests for compute_${highest_arch}"
+    ${option_init}
+  )
+  if (THRUST_ENABLE_COMPUTE_FUTURE)
+    string(APPEND THRUST_CUDA_FLAGS_BASE
+      " -gencode arch=compute_${highest_arch},code=compute_${highest_arch}"
+    )
+    string(APPEND compute_message " compute_${highest_arch}")
+  endif()
+endif()
+
+message(STATUS "Thrust: Explicitly enabled compute architectures:${compute_message}")
+
+# RDC is off by default in NVCC and on by default in NVC++. Turning off RDC
+# isn't currently supported by NVC++. So, we default to RDC off for NVCC and
+# RDC on for NVC++.
+set(option_init OFF)
+if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  set(option_init ON)
+endif()
+
+option(THRUST_ENABLE_TESTS_WITH_RDC
+  "Build all Thrust tests with RDC; tests that require RDC are not affected by this option."
+  ${option_init}
+)
+
+option(THRUST_ENABLE_EXAMPLES_WITH_RDC
+  "Build all Thrust examples with RDC; examples which require RDC are not affected by this option."
+  ${option_init}
+)
+
+# Check for RDC/SM compatibility and error/warn if necessary
+foreach (sm IN LISTS no_rdc_archs)
+  set(sm_opt THRUST_ENABLE_COMPUTE_${sm})
+  if (${sm_opt})
+    foreach (opt IN ITEMS TESTS EXAMPLES)
+      set(rdc_opt THRUST_ENABLE_${opt}_WITH_RDC)
+      if (${rdc_opt})
+        message(FATAL_ERROR
+          "${rdc_opt} is incompatible with ${sm_opt}, since sm_${sm} does not "
+          "support RDC."
+        )
+      endif()
+    endforeach()
+
+    message(NOTICE
+      "sm_${sm} does not support RDC. Targets that require RDC will be built "
+      "without support for this architecture."
+    )
+  endif()
+endforeach()
+
+
+# 
+# Clang CUDA options 
+#
+if ("Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  set(THRUST_CUDA_FLAGS_BASE "${THRUST_CUDA_FLAGS_BASE} -Wno-unknown-cuda-version -Xclang=-fcuda-allow-variadic-functions")
+endif()
+
+
+# By default RDC is not used:
+set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_NO_RDC}")
diff --git a/cmake/ThrustFindThrust.cmake b/cmake/ThrustFindThrust.cmake
new file mode 100644
index 000000000..39a79e4b7
--- /dev/null
+++ b/cmake/ThrustFindThrust.cmake
@@ -0,0 +1,42 @@
+function(_thrust_find_thrust_multiconfig)
+  # Check which systems are enabled by multiconfig:
+  set(req_systems)
+  if (THRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA)
+    list(APPEND req_systems CUDA)
+  endif()
+  if (THRUST_MULTICONFIG_ENABLE_SYSTEM_CPP)
+    list(APPEND req_systems CPP)
+  endif()
+  if (THRUST_MULTICONFIG_ENABLE_SYSTEM_TBB)
+    list(APPEND req_systems TBB)
+  endif()
+  if (THRUST_MULTICONFIG_ENABLE_SYSTEM_OMP)
+    list(APPEND req_systems OMP)
+  endif()
+
+  find_package(Thrust REQUIRED CONFIG
+    NO_DEFAULT_PATH # Only check the explicit path in HINTS:
+    HINTS "${Thrust_SOURCE_DIR}"
+    COMPONENTS ${req_systems}
+  )
+endfunction()
+
+function(_thrust_find_thrust_singleconfig)
+  find_package(Thrust REQUIRED CONFIG
+    NO_DEFAULT_PATH # Only check the explicit path in HINTS:
+    HINTS "${Thrust_SOURCE_DIR}"
+  )
+  # Create target now to prepare system found flags:
+  thrust_create_target(thrust FROM_OPTIONS ${THRUST_TARGET_FLAGS})
+  thrust_debug_target(thrust "${THRUST_VERSION}")
+endfunction()
+
+# Build a ${THRUST_TARGETS} list containing target names for all
+# requested configurations
+function(thrust_find_thrust)
+  if (THRUST_ENABLE_MULTICONFIG)
+    _thrust_find_thrust_multiconfig()
+  else()
+    _thrust_find_thrust_singleconfig()
+  endif()
+endfunction()
diff --git a/cmake/ThrustHeaderTesting.cmake b/cmake/ThrustHeaderTesting.cmake
new file mode 100644
index 000000000..3b3e00ca8
--- /dev/null
+++ b/cmake/ThrustHeaderTesting.cmake
@@ -0,0 +1,140 @@
+# For every public header, build a translation unit containing `#include <header>`
+# to let the compiler try to figure out warnings in that header if it is not otherwise
+# included in tests, and also to verify if the headers are modular enough.
+# .inl files are not globbed for, because they are not supposed to be used as public
+# entrypoints.
+
+# Meta target for all configs' header builds:
+add_custom_target(thrust.all.headers)
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_host ${thrust_target} HOST)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+  set(config_systems ${config_host} ${config_device})
+
+  string(TOLOWER "${config_host}" host_lower)
+  string(TOLOWER "${config_device}" device_lower)
+
+  # GLOB ALL THE THINGS
+  set(headers_globs thrust/*.h)
+  set(headers_exclude_systems_globs thrust/system/*/*)
+  set(headers_systems_globs
+    thrust/system/${host_lower}/*
+    thrust/system/${device_lower}/*
+  )
+  set(headers_exclude_details_globs
+    thrust/detail/*
+    thrust/*/detail/*
+    thrust/*/*/detail/*
+  )
+
+  # Get all .h files...
+  file(GLOB_RECURSE headers
+    RELATIVE "${Thrust_SOURCE_DIR}/thrust"
+    CONFIGURE_DEPENDS
+    ${headers_globs}
+  )
+
+  # ...then remove all system specific headers...
+  file(GLOB_RECURSE headers_exclude_systems
+    RELATIVE "${Thrust_SOURCE_DIR}/thrust"
+    CONFIGURE_DEPENDS
+    ${headers_exclude_systems_globs}
+  )
+  list(REMOVE_ITEM headers ${headers_exclude_systems})
+
+  # ...then add all headers specific to the selected host and device systems back again...
+  file(GLOB_RECURSE headers_systems
+    RELATIVE ${Thrust_SOURCE_DIR}/thrust
+    CONFIGURE_DEPENDS
+    ${headers_systems_globs}
+  )
+  list(APPEND headers ${headers_systems})
+
+  # ...and remove all the detail headers (also removing the detail headers from the selected systems).
+  file(GLOB_RECURSE headers_exclude_details
+    RELATIVE "${Thrust_SOURCE_DIR}/thrust"
+    CONFIGURE_DEPENDS
+    ${headers_exclude_details_globs}
+  )
+  list(REMOVE_ITEM headers ${headers_exclude_details})
+
+  # List of headers that aren't implemented for all backends, but are implemented for CUDA.
+  set(partially_implemented_CUDA
+    async/copy.h
+    async/for_each.h
+    async/reduce.h
+    async/scan.h
+    async/sort.h
+    async/transform.h
+    event.h
+    future.h
+  )
+
+  # List of headers that aren't implemented for all backends, but are implemented for CPP.
+  set(partially_implemented_CPP
+  )
+
+  # List of headers that aren't implemented for all backends, but are implemented for TBB.
+  set(partially_implemented_TBB
+  )
+
+  # List of headers that aren't implemented for all backends, but are implemented for OMP.
+  set(partially_implemented_OMP
+  )
+
+  # List of all partially implemented headers.
+  set(partially_implemented
+    ${partially_implemented_CUDA}
+    ${partially_implemented_CPP}
+    ${partially_implemented_TBB}
+    ${partially_implemented_OMP}
+  )
+  list(REMOVE_DUPLICATES partially_implemented)
+
+  set(headertest_srcs)
+
+  foreach (header IN LISTS headers)
+    if ("${header}" IN_LIST partially_implemented)
+      # This header is partially implemented on _some_ backends...
+      if (NOT "${header}" IN_LIST partially_implemented_${config_device})
+        # ...but not on the selected one.
+        continue()
+      endif()
+    endif()
+
+    set(headertest_src_ext .cpp)
+    if ("CUDA" STREQUAL "${config_device}")
+      set(headertest_src_ext .cu)
+    endif()
+
+    set(headertest_src "headers/${config_prefix}/${header}${headertest_src_ext}")
+    configure_file("${Thrust_SOURCE_DIR}/cmake/header_test.in" "${headertest_src}")
+
+    list(APPEND headertest_srcs "${headertest_src}")
+  endforeach()
+
+  set(headertest_target ${config_prefix}.headers)
+  add_library(${headertest_target} OBJECT ${headertest_srcs})
+  target_link_libraries(${headertest_target} PUBLIC ${thrust_target})
+  # Wrap Thrust/CUB in a custom namespace to check proper use of ns macros:
+  target_compile_definitions(${headertest_target} PRIVATE
+    "THRUST_WRAPPED_NAMESPACE=wrapped_thrust"
+    "CUB_WRAPPED_NAMESPACE=wrapped_cub"
+  )
+  thrust_clone_target_properties(${headertest_target} ${thrust_target})
+
+  # Disable macro checks on TBB; the TBB atomic implementation uses `I` and
+  # our checks will issue false errors.
+  if ("TBB" IN_LIST config_systems)
+    target_compile_definitions(${headertest_target}
+      PRIVATE THRUST_IGNORE_MACRO_CHECKS
+    )
+  endif()
+
+  thrust_fix_clang_nvcc_build_for(${headertest_target})
+
+  add_dependencies(thrust.all.headers ${headertest_target})
+  add_dependencies(${config_prefix}.all ${headertest_target})
+endforeach()
diff --git a/cmake/ThrustInstallRules.cmake b/cmake/ThrustInstallRules.cmake
new file mode 100644
index 000000000..98e72e196
--- /dev/null
+++ b/cmake/ThrustInstallRules.cmake
@@ -0,0 +1,58 @@
+# Bring in CMAKE_INSTALL_LIBDIR
+include(GNUInstallDirs)
+
+# Thrust is a header library; no need to build anything before installing:
+set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY TRUE)
+
+install(DIRECTORY "${Thrust_SOURCE_DIR}/thrust"
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
+  FILES_MATCHING
+    PATTERN "*.h"
+    PATTERN "*.inl"
+)
+
+install(DIRECTORY "${Thrust_SOURCE_DIR}/thrust/cmake/"
+  DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/thrust"
+  PATTERN *.cmake.in EXCLUDE
+)
+# Need to configure a file to store the infix specified in
+# CMAKE_INSTALL_INCLUDEDIR since it can be defined by the user
+set(install_location "${CMAKE_INSTALL_LIBDIR}/cmake/thrust")
+configure_file("${Thrust_SOURCE_DIR}/thrust/cmake/thrust-header-search.cmake.in"
+  "${Thrust_BINARY_DIR}/thrust/cmake/thrust-header-search.cmake"
+  @ONLY)
+install(FILES "${Thrust_BINARY_DIR}/thrust/cmake/thrust-header-search.cmake"
+  DESTINATION "${install_location}")
+
+# Depending on how Thrust is configured, libcudacxx and CUB's CMake scripts may
+# or may not be include()'d, so force include their install rules when requested.
+# By default, these projects are installed alongside Thrust. This is controlled by
+# THRUST_INSTALL_CUB_HEADERS and THRUST_INSTALL_LIBCUDACXX_HEADERS.
+option(THRUST_INSTALL_CUB_HEADERS "Include CUB headers when installing." ON)
+if (THRUST_INSTALL_CUB_HEADERS)
+  # Use a function to limit scope of the CUB_*_DIR vars:
+  function(_thrust_install_cub_headers)
+    # Fake these for the logic in CUBInstallRules.cmake:
+    set(CUB_SOURCE_DIR "${Thrust_SOURCE_DIR}/dependencies/cub/")
+    set(CUB_BINARY_DIR "${Thrust_BINARY_DIR}/cub-config/")
+    set(CUB_ENABLE_INSTALL_RULES ON)
+    set(CUB_IN_THRUST OFF)
+    include("${Thrust_SOURCE_DIR}/dependencies/cub/cmake/CubInstallRules.cmake")
+  endfunction()
+
+  _thrust_install_cub_headers()
+endif()
+
+option(THRUST_INSTALL_LIBCUDACXX_HEADERS "Include libcudacxx headers when installing." ON)
+if (THRUST_INSTALL_LIBCUDACXX_HEADERS)
+  # Use a function to limit scope of the libcudacxx_*_DIR vars:
+  function(_thrust_install_libcudacxx_headers)
+    # Fake these for the logic in libcudacxxInstallRules.cmake:
+    set(libcudacxx_SOURCE_DIR "${Thrust_SOURCE_DIR}/dependencies/libcudacxx/")
+    set(libcudacxx_BINARY_DIR "${Thrust_BINARY_DIR}/libcudacxx-config/")
+    set(libcudacxx_ENABLE_INSTALL_RULES ON)
+    include("${Thrust_SOURCE_DIR}/dependencies/libcudacxx/cmake/libcudacxxInstallRules.cmake")
+  endfunction()
+
+  _thrust_install_libcudacxx_headers()
+endif()
diff --git a/cmake/ThrustMultiConfig.cmake b/cmake/ThrustMultiConfig.cmake
new file mode 100644
index 000000000..aa9fc0226
--- /dev/null
+++ b/cmake/ThrustMultiConfig.cmake
@@ -0,0 +1,129 @@
+# This file defines thrust_configure_multiconfig(), which sets up and handles
+# the MultiConfig options that allow multiple host/device/dialect configurations
+# to be generated from a single thrust build.
+
+function(thrust_configure_multiconfig)
+  option(THRUST_ENABLE_MULTICONFIG "Enable multiconfig options for coverage testing." OFF)
+
+  # Dialects:
+  set(THRUST_CPP_DIALECT_OPTIONS
+    11 14 17 20
+    CACHE INTERNAL "C++ dialects supported by Thrust." FORCE
+  )
+
+  if (THRUST_ENABLE_MULTICONFIG)
+    # Handle dialect options:
+    foreach (dialect IN LISTS THRUST_CPP_DIALECT_OPTIONS)
+      set(default_value OFF)
+      if (dialect EQUAL 14) # Default to just 14 on:
+        set(default_value ON)
+      endif()
+      option(THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${dialect}
+        "Generate C++${dialect} build configurations."
+        ${default_value}
+      )
+    endforeach()
+
+    # Option to enable all standards supported by the CUDA and CXX compilers:
+    option(THRUST_MULTICONFIG_ENABLE_DIALECT_ALL
+      "Generate build configurations for all C++ standards supported by the configured compilers."
+      OFF
+    )
+
+    # Option to enable only the most recent supported dialect:
+    option(THRUST_MULTICONFIG_ENABLE_DIALECT_LATEST
+      "Generate a single build configuration for the most recent C++ standard supported by the configured compilers."
+      OFF
+    )
+
+    # Systems:
+    option(THRUST_MULTICONFIG_ENABLE_SYSTEM_CPP "Generate build configurations that use CPP." ON)
+    option(THRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA "Generate build configurations that use CUDA." ON)
+    option(THRUST_MULTICONFIG_ENABLE_SYSTEM_OMP "Generate build configurations that use OpenMP." OFF)
+    option(THRUST_MULTICONFIG_ENABLE_SYSTEM_TBB "Generate build configurations that use TBB." OFF)
+
+    # CMake fixed C++17 support for NVCC + MSVC targets in 3.18.3:
+    if (THRUST_MULTICONFIG_ENABLE_DIALECT_CPP17 AND
+        THRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA)
+      cmake_minimum_required(VERSION 3.18.3)
+    endif()
+
+    # Workload:
+    # - `SMALL`: [3 configs] Minimal coverage and validation of each device system against the `CPP` host.
+    # - `MEDIUM`: [6 configs] Cheap extended coverage.
+    # - `LARGE`: [8 configs] Expensive extended coverage. Include all useful build configurations.
+    # - `FULL`: [12 configs] The complete cross product of all possible build configurations.
+    #
+    # Config   | Workloads | Value      | Expense   | Note
+    # ---------|-----------|------------|-----------|-----------------------------
+    # CPP/CUDA | F L M S   | Essential  | Expensive | Validates CUDA against CPP
+    # CPP/OMP  | F L M S   | Essential  | Cheap     | Validates OMP against CPP
+    # CPP/TBB  | F L M S   | Essential  | Cheap     | Validates TBB against CPP
+    # CPP/CPP  | F L M     | Important  | Cheap     | Tests CPP as device
+    # OMP/OMP  | F L M     | Important  | Cheap     | Tests OMP as host
+    # TBB/TBB  | F L M     | Important  | Cheap     | Tests TBB as host
+    # TBB/CUDA | F L       | Important  | Expensive | Validates TBB/CUDA interop
+    # OMP/CUDA | F L       | Important  | Expensive | Validates OMP/CUDA interop
+    # TBB/OMP  | F         | Not useful | Cheap     | Mixes CPU-parallel systems
+    # OMP/TBB  | F         | Not useful | Cheap     | Mixes CPU-parallel systems
+    # TBB/CPP  | F         | Not Useful | Cheap     | Parallel host, serial device
+    # OMP/CPP  | F         | Not Useful | Cheap     | Parallel host, serial device
+
+    set(THRUST_MULTICONFIG_WORKLOAD SMALL CACHE STRING
+      "Limit host/device configs: SMALL (up to 3 h/d combos per dialect), MEDIUM(6), LARGE(8), FULL(12)"
+    )
+    set_property(CACHE THRUST_MULTICONFIG_WORKLOAD PROPERTY STRINGS
+      SMALL MEDIUM LARGE FULL
+    )
+    set(THRUST_MULTICONFIG_WORKLOAD_SMALL_CONFIGS
+      CPP_OMP CPP_TBB CPP_CUDA
+      CACHE INTERNAL "Host/device combos enabled for SMALL workloads." FORCE
+    )
+    set(THRUST_MULTICONFIG_WORKLOAD_MEDIUM_CONFIGS
+      ${THRUST_MULTICONFIG_WORKLOAD_SMALL_CONFIGS}
+      CPP_CPP TBB_TBB OMP_OMP
+      CACHE INTERNAL "Host/device combos enabled for MEDIUM workloads." FORCE
+    )
+    set(THRUST_MULTICONFIG_WORKLOAD_LARGE_CONFIGS
+      ${THRUST_MULTICONFIG_WORKLOAD_MEDIUM_CONFIGS}
+      OMP_CUDA TBB_CUDA
+      CACHE INTERNAL "Host/device combos enabled for LARGE workloads." FORCE
+    )
+    set(THRUST_MULTICONFIG_WORKLOAD_FULL_CONFIGS
+      ${THRUST_MULTICONFIG_WORKLOAD_LARGE_CONFIGS}
+      OMP_CPP TBB_CPP OMP_TBB TBB_OMP
+      CACHE INTERNAL "Host/device combos enabled for FULL workloads." FORCE
+    )
+
+    # Hide the single config options if they exist from a previous run:
+    if (DEFINED THRUST_HOST_SYSTEM)
+      set_property(CACHE THRUST_HOST_SYSTEM PROPERTY TYPE INTERNAL)
+      set_property(CACHE THRUST_DEVICE_SYSTEM PROPERTY TYPE INTERNAL)
+    endif()
+    if (DEFINED THRUST_CPP_DIALECT)
+      set_property(CACHE THRUST_CPP_DIALECT PROPERTY TYPE INTERNAL)
+    endif()
+
+  else() # Single config:
+    # Restore system option visibility if these cache options already exist
+    # from a previous run.
+    if (DEFINED THRUST_HOST_SYSTEM)
+      set_property(CACHE THRUST_HOST_SYSTEM PROPERTY TYPE STRING)
+      set_property(CACHE THRUST_DEVICE_SYSTEM PROPERTY TYPE STRING)
+    endif()
+
+    set(THRUST_CPP_DIALECT 14
+      CACHE STRING "The C++ standard to target: ${THRUST_CPP_DIALECT_OPTIONS}"
+    )
+    set_property(CACHE THRUST_CPP_DIALECT
+      PROPERTY STRINGS
+      ${THRUST_CPP_DIALECT_OPTIONS}
+    )
+
+    # CMake fixed C++17 support for NVCC + MSVC targets in 3.18.3:
+    if (THRUST_CPP_DIALECT EQUAL 17 AND
+        THRUST_DEVICE_SYSTEM STREQUAL "CUDA")
+      cmake_minimum_required(VERSION 3.18.3)
+    endif()
+  endif()
+endfunction()
diff --git a/cmake/ThrustRunExample.cmake b/cmake/ThrustRunExample.cmake
new file mode 100644
index 000000000..24e9dd2bb
--- /dev/null
+++ b/cmake/ThrustRunExample.cmake
@@ -0,0 +1,49 @@
+# Inputs:
+#
+# Variable             | Type     | Doc
+# ---------------------|----------|--------------------------------------
+# EXAMPLE_EXECUTABLE   | FilePath | Path to example executable
+# FILECHECK_ENABLED    | Boolean  | Run FileCheck comparison test
+# FILECHECK_EXECUTABLE | FilePath | Path to the LLVM FileCheck utility
+# REFERENCE_FILE       | FilePath | Path to the FileCheck reference file
+
+if (FILECHECK_ENABLED)
+  if (NOT EXISTS "${REFERENCE_FILE}")
+    message(FATAL_ERROR
+      "FileCheck requested for '${EXAMPLE_EXECUTABLE}', but reference file "
+      "does not exist at '${REFERENCE_FILE}`."
+    )
+  endif()
+
+  # If the reference file is empty, validate that the example doesn't
+  # produce any output.
+  file(SIZE "${REFERENCE_FILE}" file_size)
+  message("${REFERENCE_FILE}: ${file_size} bytes")
+
+  if (file_size EQUAL 0)
+    set(check_empty_output TRUE)
+    set(filecheck_command)
+  else()
+    set(check_empty_output FALSE)
+    set(filecheck_command COMMAND "${FILECHECK_EXECUTABLE}" "${REFERENCE_FILE}")
+  endif()
+endif()
+
+execute_process(
+  COMMAND "${EXAMPLE_EXECUTABLE}"
+  ${filecheck_command}
+  RESULT_VARIABLE exit_code
+  OUTPUT_VARIABLE stdout
+  ERROR_VARIABLE stderr
+)
+
+if (NOT 0 EQUAL exit_code)
+  message(FATAL_ERROR "${EXAMPLE_EXECUTABLE} failed (${exit_code}):\n${stderr}")
+endif()
+
+if (check_empty_output)
+  string(LENGTH "${stdout}" stdout_size)
+  if (NOT stdout_size EQUAL 0)
+    message(FATAL_ERROR "${EXAMPLE_EXECUTABLE}: output received, but not expected:\n${stdout}")
+  endif()
+endif()
diff --git a/cmake/run_test.cmake b/cmake/ThrustRunTest.cmake
similarity index 100%
rename from cmake/run_test.cmake
rename to cmake/ThrustRunTest.cmake
diff --git a/cmake/ThrustUtilities.cmake b/cmake/ThrustUtilities.cmake
new file mode 100644
index 000000000..6bbb1200a
--- /dev/null
+++ b/cmake/ThrustUtilities.cmake
@@ -0,0 +1,25 @@
+# Given a cu_file (e.g. foo/bar.cu) relative to CMAKE_CURRENT_SOURCE_DIR
+# and a thrust_target, create a cpp file that includes the .cu file, and set
+# ${cpp_file_var} in the parent scope to the full path of the new file. The new
+# file will be generated in:
+# ${CMAKE_CURRENT_BINARY_DIR}/<thrust_target_prefix>/${cu_file}.cpp
+function(thrust_wrap_cu_in_cpp cpp_file_var cu_file thrust_target)
+  thrust_get_target_property(prefix ${thrust_target} PREFIX)
+  set(wrapped_source_file "${CMAKE_CURRENT_SOURCE_DIR}/${cu_file}")
+  set(cpp_file "${CMAKE_CURRENT_BINARY_DIR}/${prefix}/${cu_file}.cpp")
+  configure_file("${Thrust_SOURCE_DIR}/cmake/wrap_source_file.cpp.in" "${cpp_file}")
+  set(${cpp_file_var} "${cpp_file}" PARENT_SCOPE)
+endfunction()
+
+# Enable RDC for a CUDA target. Encapsulates compiler hacks:
+function(thrust_enable_rdc_for_cuda_target target_name)
+  if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+    set_target_properties(${target_name} PROPERTIES
+      COMPILE_FLAGS "-gpu=rdc"
+    )
+  else()
+    set_target_properties(${target_name} PROPERTIES
+      CUDA_SEPARABLE_COMPILATION ON
+    )
+  endif()
+endfunction()
diff --git a/cmake/common_variables.cmake b/cmake/common_variables.cmake
deleted file mode 100644
index 2ff72eb53..000000000
--- a/cmake/common_variables.cmake
+++ /dev/null
@@ -1 +0,0 @@
-set(THRUST_FILECHECK_DATA_PATH "${THRUST_SOURCE}/internal/test")
diff --git a/cmake/detect_compute_archs.cu b/cmake/detect_compute_archs.cu
new file mode 100644
index 000000000..1d30dca4b
--- /dev/null
+++ b/cmake/detect_compute_archs.cu
@@ -0,0 +1,43 @@
+/*
+ *  Copyright 2019-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <cstdio>
+#include <set>
+#include <string>
+
+int main(int argc, char** argv) {
+  std::set<std::string> archs;
+  int devices;
+  if ((cudaGetDeviceCount(&devices) == cudaSuccess) && (devices > 0)) {
+    for (int dev = 0; dev < devices; ++dev) {
+      char buff[32];
+      cudaDeviceProp prop;
+      if(cudaGetDeviceProperties(&prop, dev) != cudaSuccess) continue;
+      sprintf(buff, "%d%d", prop.major, prop.minor);
+      archs.insert(buff);
+    }
+  }
+  if (archs.empty()) {
+    printf("NONE");
+  } else {
+    bool first = true;
+    for(const auto& arch : archs) {
+      printf(first ? "%s" : ";%s", arch.c_str());
+      first = false;
+    }
+  }
+  printf("\n");
+}
diff --git a/cmake/filecheck_smoke_test b/cmake/filecheck_smoke_test
new file mode 100644
index 000000000..aad1b0fd1
--- /dev/null
+++ b/cmake/filecheck_smoke_test
@@ -0,0 +1 @@
+SMOKE
diff --git a/cmake/header_test.in b/cmake/header_test.in
index 4c8ec00f5..250dd5170 100644
--- a/cmake/header_test.in
+++ b/cmake/header_test.in
@@ -1,3 +1,61 @@
+// This source file checks that:
+// 1) Header <thrust/${header}> compiles without error.
+// 2) Common macro collisions with platform/system headers are avoided.
+
+// Turn off failures for certain configurations:
 #define THRUST_CPP11_REQUIRED_NO_ERROR
+#define THRUST_CPP14_REQUIRED_NO_ERROR
 #define THRUST_MODERN_GCC_REQUIRED_NO_ERROR
-#include <thrust/${THRUST_HEADER}>
+
+#ifndef THRUST_IGNORE_MACRO_CHECKS
+
+// Define THRUST_MACRO_CHECK(macro, header), which emits a diagnostic indicating
+// a potential macro collision and halts.
+//
+// Hacky way to build a string, but it works on all tested platforms.
+#define THRUST_MACRO_CHECK(MACRO, HEADER)                                      \
+  THRUST_MACRO_CHECK_IMPL(Identifier MACRO should not be used from Thrust      \
+                          headers due to conflicts with HEADER macros.)
+
+// Use raw platform checks instead of the THRUST_HOST_COMPILER macros since we
+// don't want to #include any headers other than the one being tested.
+//
+// This is only implemented for MSVC/GCC/Clang.
+#if defined(_MSC_VER) // MSVC
+
+// Fake up an error for MSVC
+#define THRUST_MACRO_CHECK_IMPL(msg)                                           \
+  /* Print message that looks like an error: */                                \
+  __pragma(message(__FILE__ ":" THRUST_MACRO_CHECK_IMPL0(__LINE__)             \
+                   ": error: " #msg))                                          \
+  /* abort compilation due to static_assert or syntax error: */                \
+  static_assert(false, #msg);
+#define THRUST_MACRO_CHECK_IMPL0(x) THRUST_MACRO_CHECK_IMPL1(x)
+#define THRUST_MACRO_CHECK_IMPL1(x) #x
+
+#elif defined(__clang__) || defined(__GNUC__)
+
+// GCC/clang are easy:
+#define THRUST_MACRO_CHECK_IMPL(msg) THRUST_MACRO_CHECK_IMPL0(GCC error #msg)
+#define THRUST_MACRO_CHECK_IMPL0(expr) _Pragma(#expr)
+
+#endif
+
+// complex.h conflicts
+#define I THRUST_MACRO_CHECK('I', complex.h)
+
+// windows.h conflicts
+#define small THRUST_MACRO_CHECK('small', windows.h)
+// We can't enable these checks without breaking some builds -- some standard
+// library implementations unconditionally `#undef` these macros, which then
+// causes random failures later.
+// Leaving these commented out as a warning: Here be dragons.
+//#define min(...) THRUST_MACRO_CHECK('min', windows.h)
+//#define max(...) THRUST_MACRO_CHECK('max', windows.h)
+
+// termios.h conflicts (NVIDIA/thrust#1547)
+#define B0 THRUST_MACRO_CHECK("B0", termios.h)
+
+#endif // THRUST_IGNORE_MACRO_CHECKS
+
+#include <thrust/${header}>
diff --git a/cmake/run_example.cmake b/cmake/run_example.cmake
deleted file mode 100644
index d51152d1e..000000000
--- a/cmake/run_example.cmake
+++ /dev/null
@@ -1,34 +0,0 @@
-include("${THRUST_SOURCE}/cmake/common_variables.cmake")
-
-if (THRUST_FILECHECK_ENABLED)
-  set(DATA_FILE "${THRUST_FILECHECK_DATA_PATH}/${THRUST_EXAMPLE}.filecheck")
-  file(READ "${DATA_FILE}" CONTENTS)
-  string(LENGTH "${CONTENTS}" LENGTH)
-  message(${LENGTH})
-
-  if (NOT ${LENGTH} EQUAL 0)
-    set(FILECHECK_COMMAND
-      COMMAND "${THRUST_FILECHECK}" "${THRUST_FILECHECK_DATA_PATH}/${THRUST_EXAMPLE}.filecheck")
-  else ()
-    set(CHECK_EMPTY_OUTPUT TRUE)
-  endif ()
-endif ()
-
-execute_process(
-  COMMAND "${THRUST_BINARY}"
-  ${FILECHECK_COMMAND}
-  RESULT_VARIABLE EXIT_CODE
-  OUTPUT_VARIABLE STDOUT
-  ERROR_VARIABLE STDERR
-)
-
-if (NOT "0" STREQUAL "${EXIT_CODE}")
-  message(FATAL_ERROR "${THRUST_BINARY} failed (${EXIT_CODE}):\n${STDERR}")
-endif ()
-
-if (CHECK_EMPTY_OUTPUT)
-  string(LENGTH "${OUTPUT_VARIABLE}" LENGTH)
-  if (NOT ${LENGTH} EQUAL 0)
-    message(FATAL_ERROR "${THRUST_BINARY}: output received, but not expected.")
-  endif ()
-endif ()
diff --git a/cmake/sanity b/cmake/sanity
deleted file mode 100644
index f9db80b7f..000000000
--- a/cmake/sanity
+++ /dev/null
@@ -1 +0,0 @@
-SANITY
diff --git a/cmake/wrap_source_file.cpp.in b/cmake/wrap_source_file.cpp.in
new file mode 100644
index 000000000..3015238cc
--- /dev/null
+++ b/cmake/wrap_source_file.cpp.in
@@ -0,0 +1 @@
+#include <${wrapped_source_file}>
diff --git a/dependencies/cub b/dependencies/cub
new file mode 160000
index 000000000..b2e8bccb8
--- /dev/null
+++ b/dependencies/cub
@@ -0,0 +1 @@
+Subproject commit b2e8bccb8c0cd15279974fe4b9b8d6fcd1842b57
diff --git a/dependencies/libcudacxx b/dependencies/libcudacxx
new file mode 160000
index 000000000..55dd2c993
--- /dev/null
+++ b/dependencies/libcudacxx
@@ -0,0 +1 @@
+Subproject commit 55dd2c99346baa3a14949a0f7e9c41865e434eda
diff --git a/doc/branching.md b/doc/branching.md
deleted file mode 100644
index 947ab1062..000000000
--- a/doc/branching.md
+++ /dev/null
@@ -1,125 +0,0 @@
-# Thrust Branching and Development Model
-
-The following is a description of how the Thrust development teams approaches branching and release tagging. This
-is a living document that will evolve as our process evolves.
-
-## Thrust Version
-
-Thrust has historically had its own versioning system, independent of the versioning scheme of the CUDA Toolkit.
-Today, Thrust is released with the CUDA Toolkit, but we currently still maintain the double versioning scheme.
-
-The following is a mapping from Thrust versions to CUDA Toolkit versions and vice versa. Note that some Thrust
-versions don't directly map to any CUDA Toolkit version.
-
-| Thrust version    | CUDA version  |
-| ----------------- | ------------- |
-| 1.9.5             | 10.1 Update 1 |
-| 1.9.4             | 10.1          |
-| 1.9.3             | 10.0          |
-| 1.9.2             | 9.2           |
-| 1.9.1             | 9.1           |
-| 1.9.0             | 9.0           |
-| 1.8.3             | 8.0           |
-| 1.8.2             | 7.5           |
-| 1.8.1             | 7.0           |
-| 1.8.0             | *N/A*         |
-| 1.7.2             | 6.5           |
-| 1.7.1             | 6.0           |
-| 1.7.0             | 5.5           |
-| 1.6.0             | *N/A*         |
-| 1.5.3             | 5.0           |
-| 1.5.2             | 4.2           |
-| 1.5.1             | 4.1           |
-| 1.5.0             | *N/A*         |
-| 1.4.0             | 4.0           |
-| 1.3.0             | 3.2           |
-| 1.2.1             | 3.1           |
-| 1.2.0             | *N/A*         |
-| 1.1.1             | *N/A*         |
-| 1.1.0             | *N/A*         |
-| 1.0.0             | *N/A*         |
-
-## Repositories
-
-As Thrust is developed both on GitHub and internally at NVIDIA, there's three main places where code lives:
-
-  * The [public Thrust repository](https://github.com/thrust/thrust), referred to as `github` later in this
-    document.
-  * An internal GitLab repository, referred to as `gitlab` later in this document.
-  * An internal Perforce repository, referred to as `perforce` later in this document.
-
-## Branches and Tags
-
-The following tag names are used in the Thrust project:
-
-  * `github/cuda-X.Y`: the tag that directly corresponds to what has been shipped in the CUDA Toolkit release X.Y.
-  * `github/A.B.C`: the tag that directly corresponds to a Thrust version A.B.C.
-
-The following branch names are used in the Thrust project:
-
-  * `github/master`: the Source of Truth development branch of Thrust.
-  * `github/old-master`: the old Source of Truth branch, before unification of public and internal repositories.
-  * `perforce/private`: mirrored github/master, plus files necessary for internal NVIDIA testing systems.
-  * `gitlab/staging/cuda-X.Y`: the branch for a CUDA Toolkit release that has not been released yet. cuda-X.Y should
-    be tagged on this branch after the final commit freeze (see "Release branches" below).
-  * `github/maintenance/cuda-Z.W`: the continuation of gitlab/staging/cuda-Z.W, but after release of CUDA Z.W, plus
-    post-release fixes if any are needed (see "Old release branches" below).
-  * `gitlab/feature/<name>`: feature branch for internally developed features.
-  * `gitlab/bug/<bug-system>-<bug-id>`: bug fix branch, where `bug-system` is `github` or `nvbug`. Permits a description
-    after `bug-id`.
-  * `gitlab/master`: same as `github/master`, but not yet published, during a freezing period (see "Feature freeze"
-    below).
-
-## Development Process Described
-
-### Normal development
-
-During regular parts of the development cycle, when we develop features on feature branches, and fix bugs on the
-main branch, we can:
-
-  * Merge internal fixes to `github/master` and to `perforce/private`.
-  * Merge Github contributions to `github/master` and to `perforce/private`.
-
-### Feature freeze
-
-In case where we have a new feature for a CUDA Toolkit release: just before the CUDA Toolkit feature freeze for a
-new release branch, we should stop merging commits (including public contributions) to `github/master`, and move to
-development on `gitlab/master`, and merge the not yet public features there.
-
-In those cases, we should wait until the new version of the toolkit is released before we push the new updated
-`gitlab/master` to `github/master`, roughly at the same time as we push from `gitlab/staging/cuda-X.Y` to
-`github/maintenance/cuda-X.Y` and tag `cuda-X.Y`, and the appropriate Thrust version tag.
-
-If we don't have big, not-public-before-release features landing in X.Y, however, we can avoid having a feature
-freeze period.
-
-The reason for having a freeze period at all is: `github/master` is supposed to be the Source of Truth. We want the
-history to follow the same order of commits in both Git and Perforce, and once a change is merged, we cannot rebase
-things that went into `perforce/internal` on top of it. Therefore: since we only really commit to Perforce but not
-`github/master` when we have a feature that is ready to be delivered, but is only a part of a new release and
-shouldn't/can't be public yet, we have to make sure that after it is merged to `gitlab/master` (and to `perforce/internal`),
-nothing new lands in `github/master` before we push the feature out.
-
-To avoid situations like this with bug fixes, when we fix a bug at a not crazy point in the release cycle, we
-should develop it on git, merge/push it on Github, and then pull the new commit to Perforce.
-
-### Release branches
-
-These are the internal Git branches that map directly to internal CUDA release branches. These branches are primarily
-developed in Git, and commits applied to them are then pushed to Perforce.
-
-After a CUDA Toolkit version is released, these transition to being old release branches.
-
-### Old release branches
-
-These branches represent a version that has landed in a CUDA Toolkit version, but with bugfixes for things that do
-deserve being fixed on a release branch. These shouldn't be groundbreaking; the following are an acceptable set of
-fixes to go into these branches, because they can remove annoyances, but shouldn't change behavior:
-
-  * Documentation fixes and updates.
-  * Thrust build system changes.
-  * Additional examples, fixes to examples and tests.
-  * (Possibly:) Fixing missing headers. This one is slightly less obvious, because it makes it possible for users
-    of standalone Thrust to write programs that won't compile with CUDA Thrust. Determinations will be made on a
-    case by case basis.
-
diff --git a/doc/changelog.md b/doc/changelog.md
deleted file mode 100644
index 98923388a..000000000
--- a/doc/changelog.md
+++ /dev/null
@@ -1,1192 +0,0 @@
-# Thrust v1.9.5  (CUDA 10.1 Update 1)
-
-## Summary
- 
-Thrust 1.9.5 is a minor release accompanying the CUDA 10.1 Update 1 release.
-
-## Bug Fixes
-
-- NVBug 2502854: Fixed assignment of
-    `thrust::device_vector<thrust::complex<T>>` between host and device.
-
-# Thrust 1.9.4 (CUDA 10.1)
-
-## Summary
-
-Thrust 1.9.4 adds asynchronous interfaces for parallel algorithms, a new
-  allocator system including caching allocators and unified memory support, as
-  well as a variety of other enhancements, mostly related to
-  C++11/C++14/C++17/C++20 support.
-The new asynchronous algorithms in the `thrust::async` namespace return
-  `thrust::event` or `thrust::future` objects, which can be waited upon to
-  synchronize with the completion of the parallel operation.
-
-## Breaking Changes
-
-Synchronous Thrust algorithms now block until all of their operations have
-  completed.
-Use the new asynchronous Thrust algorithms for non-blocking behavior.
-
-## New Features
-
-- `thrust::event` and `thrust::future<T>`, uniquely-owned asynchronous handles
-    consisting of a state (ready or not ready), content (some value; for
-    `thrust::future` only), and an optional set of objects that should be
-    destroyed only when the future's value is ready and has been consumed.
-  - The design is loosely based on C++11's `std::future`.
-  - They can be `.wait`'d on, and the value of a future can be waited on and
-      retrieved with `.get` or `.extract`.
-  - Multiple `thrust::event`s and `thrust::future`s can be combined with
-      `thrust::when_all`.
-  - `thrust::future`s can be converted to `thrust::event`s.
-  - Currently, these primitives are only implemented for the CUDA backend and
-      are C++11 only.
-- New asynchronous algorithms that return `thrust::event`/`thrust::future`s,
-    implemented as C++20 range style customization points:
-    - `thrust::async::reduce`.
-    - `thrust::async::reduce_into`, which takes a target location to store the
-        reduction result into.
-    - `thrust::async::copy`, including a two-policy overload that allows
-        explicit cross system copies which execution policy properties can be
-        attached to.
-    - `thrust::async::transform`.
-    - `thrust::async::for_each`.
-    - `thrust::async::stable_sort`.
-    - `thrust::async::sort`.
-    - By default the asynchronous algorithms use the new caching allocators.
-        Deallocation of temporary storage is deferred until the destruction of
-        the returned `thrust::future`. The content of `thrust::future`s is
-        stored in either device or universal memory and transferred to the host
-        only upon request to prevent unnecessary data migration.
-    - Asynchronous algorithms are currently only implemented for the CUDA
-        system and are C++11 only.
-- `exec.after(f, g, ...)`, a new execution policy method that takes a set of
-    `thrust::event`/`thrust::future`s and returns an execution policy that
-    operations on that execution policy should depend upon. 
-- New logic and mindset for the type requirements for cross-system sequence
-    copies (currently only used by `thrust::async::copy`), based on:
-  - `thrust::is_contiguous_iterator` and `THRUST_PROCLAIM_CONTIGUOUS_ITERATOR`
-      for detecting/indicating that an iterator points to contiguous storage.
-  - `thrust::is_trivially_relocatable` and
-      `THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE` for detecting/indicating that a
-      type is `memcpy`able (based on principles from
-      [P1144](https://wg21.link/P1144)).
-  - The new approach reduces buffering, increases performance, and increases
-      correctness.
-  - The fast path is now enabled when copying CUDA `__half` and vector types with
-      `thrust::async::copy`.
-- All Thrust synchronous algorithms for the CUDA backend now actually
-    synchronize. Previously, any algorithm that did not allocate temporary
-    storage (counterexample: `thrust::sort`) and did not have a
-    computation-dependent result (counterexample: `thrust::reduce`) would
-    actually be launched asynchronously. Additionally, synchronous algorithms
-    that allocated temporary storage would become asynchronous if a custom
-    allocator was supplied that did not synchronize on allocation/deallocation,
-    unlike `cudaMalloc`/`cudaFree`. So, now `thrust::for_each`,
-    `thrust::transform`, `thrust::sort`, etc are truly synchronous. In some
-    cases this may be a performance regression; if you need asynchrony, use the
-    new asynchronous algorithms.
-- Thrust's allocator framework has been rewritten. It now uses a memory
-    resource system, similar to C++17's `std::pmr` but supporting static
-    polymorphism. Memory resources are objects that allocate untyped storage and
-    allocators are cheap handles to memory resources in this new model. The new
-    facilities live in `<thrust/mr/*>`.
-  - `thrust::mr::memory_resource<Pointer>`, the memory resource base class,
-      which takes a (possibly tagged) pointer to `void` type as a parameter.
-  - `thrust::mr::allocator<T, MemoryResource>`, an allocator backed by a memory
-      resource object.
-  - `thrust::mr::polymorphic_adaptor_resource<Pointer>`, a type-erased memory
-      resource adaptor.
-  - `thrust::mr::polymorphic_allocator<T>`, a C++17-style polymorphic allocator
-      backed by a type-erased memory resource object.
-  - New tunable C++17-style caching memory resources,
-      `thrust::mr::(disjoint_)?(un)?synchronized_pool_resource`, designed to
-      cache both small object allocations and large repetitive temporary
-      allocations. The disjoint variants use separate storage for management of
-      the pool, which is necessary if the memory being allocated cannot be
-      accessed on the host (e.g.  device memory).
-  - System-specific allocators were rewritten to use the new memory resource
-      framework.
-  - New `thrust::device_memory_resource` for allocating device memory.    
-  - New `thrust::universal_memory_resource` for allocating memory that can be
-      accessed from both the host and device (e.g. `cudaMallocManaged`).
-  - New `thrust::universal_host_pinned_memory_resource` for allocating memory
-      that can be accessed from the host and the device but always resides in
-      host memory (e.g. `cudaMallocHost`).
-  - `thrust::get_per_device_resource` and `thrust::per_device_allocator`, which
-      lazily create and retrieve a per-device singleton memory resource.
-  - Rebinding mechanisms (`rebind_traits` and `rebind_alloc`) for
-      `thrust::allocator_traits`.
-  - `thrust::device_make_unique`, a factory function for creating a
-      `std::unique_ptr` to a newly allocated object in device memory.
-  - `<thrust/detail/memory_algorithms>`, a C++11 implementation of the C++17
-      uninitialized memory algorithms.
-  - `thrust::allocate_unique` and friends, based on the proposed C++23
-      [`std::allocate_unique`](https://wg21.link/P0211).
-- New type traits and metaprogramming facilities. Type traits are slowly being
-    migrated out of `thrust::detail::` and `<thrust/detail/*>`; their new home
-    will be `thrust::` and `<thrust/type_traits/*>`.
-  - `thrust::is_execution_policy`.
-  - `thrust::is_operator_less_or_greater_function_object`, which detects
-      `thrust::less`, `thrust::greater`, `std::less`, and `std::greater`.
-  - `thrust::is_operator_plus_function_object``, which detects `thrust::plus`
-      and `std::plus`.
-  - `thrust::remove_cvref(_t)?`, a C++11 implementation of C++20's
-      `thrust::remove_cvref(_t)?`.
-  - `thrust::void_t`, and various other new type traits.
-  - `thrust::integer_sequence` and friends, a C++11 implementation of C++20's
-      `std::integer_sequence`
-  - `thrust::conjunction`, `thrust::disjunction`, and `thrust::disjunction`, a
-      C++11 implementation of C++17's logical metafunctions.
-  - Some Thrust type traits (such as `thrust::is_constructible`) have been
-      redefined in terms of C++11's type traits when they are available.
-- `<thrust/detail/tuple_algorithms.h>`, new `std::tuple` algorithms:
-  - `thrust::tuple_transform`.
-  - `thrust::tuple_for_each`.
-  - `thrust::tuple_subset`.
-- Miscellaneous new `std::`-like facilities:
-  - `thrust::optional`, a C++11 implementation of C++17's `std::optional`.
-  - `thrust::addressof`, an implementation of C++11's `std::addressof`.
-  - `thrust::next` and `thrust::prev`, an implementation of C++11's `std::next`
-      and `std::prev`.
-  - `thrust::square`, a `<functional>` style unary function object that
-      multiplies its argument by itself.
-  - `<thrust/limits.h>` and `thrust::numeric_limits`, a customized version of
-      `<limits>` and `std::numeric_limits`.
-- `<thrust/detail/preprocessor.h>`, new general purpose preprocessor facilities:
-  - `THRUST_PP_CAT[2-5]`, concatenates two to five tokens.
-  - `THRUST_PP_EXPAND(_ARGS)?`, performs double expansion.
-  - `THRUST_PP_ARITY` and `THRUST_PP_DISPATCH`, tools for macro overloading.
-  - `THRUST_PP_BOOL`, boolean conversion.
-  - `THRUST_PP_INC` and `THRUST_PP_DEC`, increment/decrement.
-  - `THRUST_PP_HEAD`, a variadic macro that expands to the first argument.
-  - `THRUST_PP_TAIL`, a variadic macro that expands to all its arguments after
-      the first.
-  - `THRUST_PP_IIF`, bitwise conditional.
-  - `THRUST_PP_COMMA_IF`, and `THRUST_PP_HAS_COMMA`, facilities for adding and
-      detecting comma tokens.
-  - `THRUST_PP_IS_VARIADIC_NULLARY`, returns true if called with a nullary
-      `__VA_ARGS__`.
-  - `THRUST_CURRENT_FUNCTION`, expands to the name of the current function.
-- New C++11 compatibility macros:
-  - `THRUST_NODISCARD`, expands to `[[nodiscard]]` when available and the best
-      equivalent otherwise.
-  - `THRUST_CONSTEXPR`, expands to `constexpr` when available and the best
-      equivalent otherwise.
-  - `THRUST_OVERRIDE`, expands to `override` when available and the best
-      equivalent otherwise.
-  - `THRUST_DEFAULT`, expands to `= default;` when available and the best
-      equivalent otherwise.
-  - `THRUST_NOEXCEPT`, expands to `noexcept` when available and the best
-      equivalent otherwise.
-  - `THRUST_FINAL`, expands to `final` when available and the best equivalent
-      otherwise.
-  - `THRUST_INLINE_CONSTANT`, expands to `inline constexpr` when available and
-      the best equivalent otherwise.
-- `<thrust/detail/type_deduction.h>`, new C++11-only type deduction helpers:
-  - `THRUST_DECLTYPE_RETURNS*`, expand to function definitions with suitable
-      conditional `noexcept` qualifiers and trailing return types.
-  - `THRUST_FWD(x)`, expands to `::std::forward<decltype(x)>(x)`.
-  - `THRUST_MVCAP`, expands to a lambda move capture.
-  - `THRUST_RETOF`, expands to a decltype computing the return type of an
-      invocable.
-- New CMake build system.
-   
-## New Examples
-
-- `mr_basic` demonstrates how to use the new memory resource allocator system.
-
-## Other Enhancements
-
-- Tagged pointer enhancements:
-  - New `thrust::pointer_traits` specialization for `void const*`.
-  - `nullptr` support to Thrust tagged pointers.
-  - New `explicit operator bool` for Thrust tagged pointers when using C++11
-      for `std::unique_ptr` interoperability.
-  - Added `thrust::reinterpret_pointer_cast` and `thrust::static_pointer_cast`
-      for casting Thrust tagged pointers.
-- Iterator enhancements:
-  - `thrust::iterator_system` is now SFINAE friendly.
-  - Removed cv qualifiers from iterator types when using
-      `thrust::iterator_system`.
-- Static assert enhancements:
-  - New `THRUST_STATIC_ASSERT_MSG`, takes an optional string constant to be
-      used as the error message when possible.
-  - Update `THRUST_STATIC_ASSERT(_MSG)` to use C++11's `static_assert` when
-      it's available.
-  - Introduce a way to test for static assertions.
-- Testing enhancements:
-  - Additional scalar and sequence types, including non-builtin types and
-      vectors with unified memory allocators, have been added to the list of
-      types used by generic unit tests.
-  - The generation of random input data has been improved to increase the range
-      of values used and catch more corner cases.
-  - New `unittest::truncate_to_max_representable` utility for avoiding the
-      generation of ranges that cannot be represented by the underlying element
-      type in generic unit test code. 
-  - The test driver now synchronizes with CUDA devices and check for errors
-      after each test, when switching devices, and after each raw kernel launch.
-  - The `warningtester` uber header is now compiled with NVCC to avoid needing
-      to disable CUDA-specific code with the preprocessor.
-  - Fixed the unit test framework's `ASSERT_*` to print `char`s as `int`s.
-  - New `DECLARE_INTEGRAL_VARIABLE_UNITTEST` test declaration macro.
-  - New `DECLARE_VARIABLE_UNITTEST_WITH_TYPES_AND_NAME` test declaration macro.
-  - `thrust::system_error` in the CUDA backend now print out its `cudaError_t`
-      enumerator in addition to the diagnostic message.
-  - Stopped using conditionally signed types like `char`.
-
-## Bug Fixes
-
-- #897, NVBug 2062242: Fix compilation error when using `__device__` lambdas
-    with `thrust::reduce` on MSVC.
-- #908, NVBug 2089386: Static assert that `thrust::generate`/`thrust::fill`
-    isn't operating on const iterators.
-- #919 Fix compilation failure with `thrust::zip_iterator` and
-    `thrust::complex`.
-- #924, NVBug 2096679, NVBug 2315990: Fix dispatch for the CUDA backend's
-    `thrust::reduce` to use two functions (one with the pragma for disabling
-    exec checks, one with `THRUST_RUNTIME_FUNCTION`) instead of one. This fixes
-    a regression with device compilation that started in CUDA 9.2.
-- #928, NVBug 2341455: Add missing `__host__ __device__` annotations to a
-    `thrust::complex::operator=` to satisfy GoUDA.
-- NVBug 2094642: Make `thrust::vector_base::clear` not depend on the element
-    type being default constructible.
-- NVBug 2289115: Remove flaky `simple_cuda_streams` example.
-- NVBug 2328572: Add missing `thrust::device_vector` constructor that takes an
-    allocator parameter.
-- NVBug 2455740: Update the `range_view` example to not use device-side launch.
-- NVBug 2455943: Ensure that sized unit tests that use
-    `thrust::counting_iterator` perform proper truncation.
-- NVBug 2455952: Refactor questionable `thrust::copy_if` unit tests.
-
-# Thrust 1.9.3 (CUDA 10.0)     
-
-## Summary
-
-Thrust 1.9.3 unifies and integrates CUDA Thrust and GitHub Thrust.
-
-## Bug Fixes
-
-- #725, #850, #855, #859, #860: Unify the `thrust::iter_swap` interface and fix
-    `thrust::device_reference` swapping.
-- NVBug 2004663: Add a `data` method to `thrust::detail::temporary_array` and
-    refactor temporary memory allocation in the CUDA backend to be exception
-    and leak safe.
-- #886, #894, #914: Various documentation typo fixes.
-- #724: Provide `NVVMIR_LIBRARY_DIR` environment variable to NVCC.
-- #878: Optimize `thrust::min/max_element` to only use
-    `thrust::detail::get_iterator_value` for non-numeric types.
-- #899: Make `thrust::cuda::experimental::pinned_allocator`'s comparison
-    operators `const`.
-- NVBug 2092152: Remove all includes of `<cuda.h>`.
-- #911: Fix default comparator element type for `thrust::merge_by_key`. 
-
-## Acknowledgments
-
-- Thanks to Andrew Corrigan for contributing fixes for swapping interfaces.
-- Thanks to Francisco Facioni for contributing optimizations for
-    `thrust::min/max_element`.
-
-# Thrust 1.9.2 (CUDA 9.2)      
-
-## Summary
-
-Thrust 1.9.2 brings a variety of performance enhancements, bug fixes and test
-  improvements.
-CUB 1.7.5 was integrated, enhancing the performance of `thrust::sort` on
-  small data types and `thrust::reduce`.
-Changes were applied to `complex` to optimize memory access.
-Thrust now compiles with compiler warnings enabled and treated as errors.
-Additionally, the unit test suite and framework was enhanced to increase
-  coverage.
-
-## Breaking Changes
-
-- The `fallback_allocator` example was removed, as it was buggy and difficult
-    to support.
-
-## New Features
-
-- `<thrust/detail/alignment.h>`, utilities for memory alignment:
-  - `thrust::aligned_reinterpret_cast`.
-  - `thrust::aligned_storage_size`, which computes the amount of storage needed
-      for an object of a particular size and alignment.
-  - `thrust::alignment_of`, a C++03 implementation of C++11's
-      `std::alignment_of`. 
-  - `thrust::aligned_storage`, a C++03 implementation of C++11's
-      `std::aligned_storage`. 
-  - `thrust::max_align_t`, a C++03 implementation of C++11's
-      `std::max_align_t`. 
-
-## Bug Fixes
-- NVBug 200385527, NVBug 200385119, NVBug 200385113, NVBug 200349350, NVBug
-    2058778: Various compiler warning issues.
-- NVBug 200355591: `thrust::reduce` performance issues.
-- NVBug 2053727: Fixed an ADL bug that caused user-supplied `allocate` to be
-    overlooked but `deallocate` to be called with GCC <= 4.3.
-- NVBug 1777043: Fixed `thrust::complex` to work with `thrust::sequence`.
-
-# Thrust 1.9.1 (CUDA 9.1)      
-
-## Summary
-
-Thrust 1.9.1 integrates version 1.7.4 of CUB and introduces a new CUDA backend
-for `thrust::reduce` based on CUB.
-
-## Bug Fixes
-
-- NVBug 1965743: Remove unnecessary static qualifiers.
-- NVBug 1940974: Fix regression causing a compilation error when using
-    `thrust::merge_by_key` with `thrust::constant_iterator`s.
-- NVBug 1904217: Allow callables that take non-const refs to be used with
-    `thrust::reduce` and `thrust::*_scan`.
-
-# Thrust 1.9.0 (CUDA 9.0)      
-
-## Summary
-
-Thrust 1.9.0 replaces the original CUDA backend (bulk) with a new one
-  written using CUB, a high performance CUDA collectives library.
-This brings a substantial performance improvement to the CUDA backend across
-  the board.
-
-## Breaking Changes
-
-- Any code depending on CUDA backend implementation details will likely be
-    broken.
-
-## New Features
-
-- New CUDA backend based on CUB which delivers substantially higher performance.
-- `thrust::transform_output_iterator`, a fancy iterator that applies a function
-    to the output before storing the result. 
-
-## New Examples
-
-- `transform_output_iterator` demonstrates use of the new fancy iterator
-    `thrust::transform_output_iterator`.
-
-## Other Enhancements
-
-- When C++11 is enabled, functors do not have to inherit from
-    `thrust::(unary|binary)_function` anymore to be used with
-    `thrust::transform_iterator`. 
-- Added C++11 only move constructors and move assignment operators for
-    `thrust::detail::vector_base`-based classes, e.g. `thrust::host_vector`,
-    `thrust::device_vector`, and friends.
-
-## Bug Fixes
-
-- `sin(thrust::complex<double>)` no longer has precision loss to float.
-
-## Acknowledgments
-
-- Thanks to Manuel Schiller for contributing a C++11 based enhancement
-    regarding the deduction of functor return types, improving the performance
-    of `thrust::unique` and implementing `thrust::transform_output_iterator`.
-- Thanks to Thibault Notargiacomo for the implementation of move semantics for 
-    the `thrust::vector_base`-based classes.
-- Thanks to Duane Merrill for developing CUB and helping to integrate it into
-    Thrust's backend.
-
-# Thrust 1.8.3 (CUDA 8.0)      
-
-Thrust 1.8.3 is a small bug fix release.
-
-## New Examples
-
-- `range_view` demonstrates the use of a view (a non-owning wrapper for an
-    iterator range with a container-like interface).
-
-## Bug Fixes
-
-- `thrust::(min|max|minmax)_element` can now accept raw device pointers when 
-    an explicit device execution policy is used.
-- `thrust::clear` operations on vector types no longer requires the element
-    type to have a default constructor.
-
-# Thrust 1.8.2 (CUDA 7.5)      
-
-Thrust 1.8.2 is a small bug fix release.
-
-## Bug Fixes
-
-- Avoid warnings and errors concerning user functions called from
-    `__host__ __device__` functions.
-- #632: Fix an error in `thrust::set_intersection_by_key` with the CUDA backend.
-- #651: `thrust::copy` between host and device now accepts execution policies
-    with streams attached, i.e. `thrust::::cuda::par.on(stream)`.
-- #664: `thrust::for_each` and algorithms based on it no longer ignore streams
-    attached to execution policys.
-
-## Known Issues
-
-- #628: `thrust::reduce_by_key` for the CUDA backend fails for Compute
-    Capability 5.0 devices.
-
-# Thrust 1.8.1 (CUDA 7.0)      
-
-Thrust 1.8.1 is a small bug fix release.
-
-## Bug Fixes
-
-- #615, #620: Fixed `thrust::for_each` and `thrust::reduce` to no longer fail on
-    large inputs.
-
-## Known Issues
-
-- #628: `thrust::reduce_by_key` for the CUDA backend fails for Compute
-    Capability 5.0 devices.
-
-# Thrust 1.8.0            
-
-Summary
-- Thrust 1.8.0 introduces support for algorithm invocation from CUDA __device__ code, support for CUDA streams,
-- and algorithm performance improvements. Users may now invoke Thrust algorithms from CUDA __device__ code,
-- providing a parallel algorithms library to CUDA programmers authoring custom kernels, as well as allowing
-- Thrust programmers to nest their algorithm calls within functors. The thrust::seq execution policy
-- allows users to require sequential algorithm execution in the calling thread and makes a
-- sequential algorithms library available to individual CUDA threads. The .on(stream) syntax allows users to
-- request a CUDA stream for kernels launched during algorithm execution. Finally, new CUDA algorithm
-- implementations provide substantial performance improvements.
-
-## New Features
-- Algorithms in CUDA __device__ code
-      Thrust algorithms may now be invoked from CUDA __device__ and __host__ __device__ functions.
-
-      Algorithms invoked in this manner must be invoked with an execution policy as the first parameter:
-
-      __device__ int my_device_sort(int *data, size_t n)
-      {
-        thrust::sort(thrust::device, data, data + n);
-      }
-
-      The following execution policies are supported in CUDA __device__ code:
-        thrust::seq
-        thrust::cuda::par
-        thrust::device, when THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-
-      Parallel algorithm execution may not be accelerated unless CUDA Dynamic Parallelism is available.
-
-- Execution Policies
-      CUDA Streams
-        The thrust::cuda::par.on(stream) syntax allows users to request that CUDA __global__ functions launched during algorithm 
-        execution should occur on a given stream:
-
-        // execute for_each on stream s
-        thrust::for_each(thrust::cuda::par.on(s), begin, end, my_functor);
-
-        Algorithms executed with a CUDA stream in this manner may still synchronize with other streams when allocating temporary
-        storage or returning results to the CPU.
-
-      thrust::seq
-        The thrust::seq execution policy allows users to require that an algorithm execute sequentially in the calling thread:
-
-        // execute for_each sequentially in this thread
-        thrust::for_each(thrust::seq, begin, end, my_functor);
-        
-- Other
-      The new thrust::complex template provides complex number support.
-
-## New Examples
-- simple_cuda_streams demonstrates how to request a CUDA stream during algorithm execution.
-- async_reduce demonstrates ways to achieve algorithm invocations which are asynchronous with the calling thread.
-
-## Other Enhancements
-- CUDA sort performance for user-defined types is 300% faster on Tesla K20c for large problem sizes.
-- CUDA merge performance is 200% faster on Tesla K20c for large problem sizes.
-- CUDA sort performance for primitive types is 50% faster on Tesla K20c for large problem sizes.
-- CUDA reduce_by_key performance is 25% faster on Tesla K20c for large problem sizes.
-- CUDA scan performance is 15% faster on Tesla K20c for large problem sizes.
-- fallback_allocator example is simpler.
-
-## Bug Fixes
-- #364 iterators with unrelated system tags may be used with algorithms invoked with an execution policy
-- #371 do not redefine __CUDA_ARCH__
-- #379 fix crash when dereferencing transform_iterator on the CPU
-- #391 avoid use of uppercase variable names
-- #392 fix thrust::copy between cusp::complex & std::complex
-- #396 program compiled with gcc < 4.3 hangs during comparison sort
-- #406 fallback_allocator.cu example checks device for unified addressing support
-- #417 avoid using std::less<T> in binary search algorithms
-- #418 avoid various warnings
-- #443 including version.h no longer configures default systems
-- #578 nvcc produces warnings when sequential algorithms are used with cpu systems
-
-## Known Issues
-- When invoked with primitive data types, thrust::sort, thrust::sort_by_key, thrust::stable_sort, & thrust::stable_sort_by_key may
-- fail to link in some cases with nvcc -rdc=true.
-
-- The CUDA implementation of thrust::reduce_by_key incorrectly outputs the last element in a segment of equivalent keys instead of the first.
-
-Acknowledgments
-- Thanks to Sean Baxter for contributing faster CUDA reduce, merge, and scan implementations.
-- Thanks to Duane Merrill for contributing a faster CUDA radix sort implementation.
-- Thanks to Filipe Maia for contributing the implementation of thrust::complex.
-
-# Thrust 1.7.2 (CUDA 6.5)      
-
-Summary
-- Small bug fixes
-
-## Bug Fixes
-- Avoid use of std::min in generic find implementation
-
-# Thrust 1.7.1 (CUDA 6.0)      
-
-Summary
-- Small bug fixes
-
-## Bug Fixes
-- Eliminate identifiers in set_operations.cu example with leading underscore
-- Eliminate unused variable warning in CUDA reduce_by_key implementation
-- Avoid deriving function objects from std::unary_function and std::binary_function
-
-# Thrust 1.7.0 (CUDA 5.5)      
-
-Summary
-- Thrust 1.7.0 introduces a new interface for controlling algorithm execution as
-- well as several new algorithms and performance improvements. With this new
-- interface, users may directly control how algorithms execute as well as details
-- such as the allocation of temporary storage. Key/value versions of thrust::merge
-- and the set operation algorithms have been added, as well stencil versions of
-- partitioning algorithms. thrust::tabulate has been introduced to tabulate the
-- values of functions taking integers. For 32b types, new CUDA merge and set
-- operations provide 2-15x faster performance while a new CUDA comparison sort
-- provides 1.3-4x faster performance. Finally, a new TBB reduce_by_key implementation
-- provides 80% faster performance.
-
-## Breaking Changes
-- Dispatch
-      Custom user backend systems' tag types must now inherit from the corresponding system's execution_policy template (e.g. thrust::cuda::execution_policy) instead
-      of the tag struct (e.g. thrust::cuda::tag). Otherwise, algorithm specializations will silently go unfound during dispatch.
-      See examples/minimal_custom_backend.cu and examples/cuda/fallback_allocator.cu for usage examples.
-
-      thrust::advance and thrust::distance are no longer dispatched based on iterator system type and thus may no longer be customized.
-
-- Iterators
-      iterator_facade and iterator_adaptor's Pointer template parameters have been eliminated.
-      iterator_adaptor has been moved into the thrust namespace (previously thrust::experimental::iterator_adaptor).
-      iterator_facade has been moved into the thrust namespace (previously thrust::experimental::iterator_facade).
-      iterator_core_access has been moved into the thrust namespace (previously thrust::experimental::iterator_core_access).
-      All iterators' nested pointer typedef (the type of the result of operator->) is now void instead of a pointer type to indicate that such expressions are currently impossible.
-      Floating point counting_iterators' nested difference_type typedef is now a signed integral type instead of a floating point type.
-
-- Other
-      normal_distribution has been moved into the thrust::random namespace (previously thrust::random::experimental::normal_distribution).
-      Placeholder expressions may no longer include the comma operator.
-
-## New Features
-- Execution Policies
-      Users may directly control the dispatch of algorithm invocations with optional execution policy arguments.
-      For example, instead of wrapping raw pointers allocated by cudaMalloc with thrust::device_ptr, the thrust::device execution_policy may be passed as an argument to an algorithm invocation to enable CUDA execution.
-      The following execution policies are supported in this version:
-
-        thrust::host
-        thrust::device
-        thrust::cpp::par
-        thrust::cuda::par
-        thrust::omp::par
-        thrust::tbb::par
-
-- Algorithms
-	free
-	get_temporary_buffer
-	malloc
-        merge_by_key
-        partition with stencil
-        partition_copy with stencil
-	return_temporary_buffer
-        set_difference_by_key
-        set_intersection_by_key
-        set_symmetric_difference_by_key
-        set_union_by_key
-        stable_partition with stencil
-        stable_partition_copy with stencil
-	tabulate
-
-## New Examples
-- uninitialized_vector demonstrates how to use a custom allocator to avoid the automatic initialization of elements in thrust::device_vector.
-
-## Other Enhancements
-- Authors of custom backend systems may manipulate arbitrary state during algorithm dispatch by incorporating it into their execution_policy parameter.
-- Users may control the allocation of temporary storage during algorithm execution by passing standard allocators as parameters via execution policies such as thrust::device.
-- THRUST_DEVICE_SYSTEM_CPP has been added as a compile-time target for the device backend. 
-- CUDA merge performance is 2-15x faster.
-- CUDA comparison sort performance is 1.3-4x faster.
-- CUDA set operation performance is 1.5-15x faster.
-- TBB reduce_by_key performance is 80% faster.
-- Several algorithms have been parallelized with TBB.
-- Support for user allocators in vectors has been improved.
-- The sparse_vector example is now implemented with merge_by_key instead of sort_by_key.
-- Warnings have been eliminated in various contexts.
-- Warnings about __host__ or __device__-only functions called from __host__ __device__ functions have been eliminated in various contexts.
-- Documentation about algorithm requirements have been improved.
-- Simplified the minimal_custom_backend example.
-- Simplified the cuda/custom_temporary_allocation example.
-- Simplified the cuda/fallback_allocator example.
-
-## Bug Fixes
-- #248 fix broken counting_iterator<float> behavior with OpenMP
-- #231, #209 fix set operation failures with CUDA
-- #187 fix incorrect occupancy calculation with CUDA
-- #153 fix broken multigpu behavior with CUDA
-- #142 eliminate warning produced by thrust::random::taus88 and MSVC 2010
-- #208 correctly initialize elements in temporary storage when necessary
-- #16 fix compilation error when sorting bool with CUDA
-- #10 fix ambiguous overloads of reinterpret_tag
-
-## Known Issues
-- g++ versions 4.3 and lower may fail to dispatch thrust::get_temporary_buffer correctly causing infinite recursion in examples such as cuda/custom_temporary_allocation.
-
-Acknowledgments
-- Thanks to Sean Baxter, Bryan Catanzaro, and Manjunath Kudlur for contributing a faster merge implementation for CUDA.
-- Thanks to Sean Baxter for contributing a faster set operation implementation for CUDA.
-- Thanks to Cliff Woolley for contributing a correct occupancy calculation algorithm.
-
-# Thrust 1.6.0            
-
-Summary
-- Thrust v1.6.0 provides an interface for customization and extension and a new
-- backend system based on the Threading Building Blocks library. With this
-- new interface, programmers may customize the behavior of specific algorithms
-- as well as control the allocation of temporary storage or invent entirely new
-- backends. These enhancements also allow multiple different backend systems
-- such as CUDA and OpenMP to coexist within a single program. Support for TBB
-- allows Thrust programs to integrate more naturally into applications which
-- may already employ the TBB task scheduler.
-
-## Breaking Changes
-- The header <thrust/experimental/cuda/pinned_allocator.h> has been moved to <thrust/system/cuda/experimental/pinned_allocator.h>
-- thrust::experimental::cuda::pinned_allocator has been moved to thrust::cuda::experimental::pinned_allocator
-- The macro THRUST_DEVICE_BACKEND has been renamed THRUST_DEVICE_SYSTEM
-- The macro THRUST_DEVICE_BACKEND_CUDA has been renamed THRUST_DEVICE_SYSTEM_CUDA
-- The macro THRUST_DEVICE_BACKEND_OMP has been renamed THRUST_DEVICE_SYSTEM_OMP
-- thrust::host_space_tag has been renamed thrust::host_system_tag
-- thrust::device_space_tag has been renamed thrust::device_system_tag
-- thrust::any_space_tag has been renamed thrust::any_system_tag
-- thrust::iterator_space has been renamed thrust::iterator_system
-    
-
-## New Features
-- Backend Systems
-        Threading Building Blocks (TBB) is now supported
-- Functions
-        for_each_n
-        raw_reference_cast
-- Types
-        pointer
-        reference
-
-## New Examples
-- cuda/custom_temporary_allocation
-- cuda/fallback_allocator
-- device_ptr
-- expand
-- minimal_custom_backend
-- raw_reference_cast
-- set_operations
-
-## Other Enhancements
-- thrust::for_each now returns the end of the input range similar to most other algorithms
-- thrust::pair and thrust::tuple have swap functionality
-- all CUDA algorithms now support large data types
-- iterators may be dereferenced in user __device__ or __global__ functions
-- the safe use of different backend systems is now possible within a single binary
-
-## Bug Fixes
-- #469 min_element and max_element algorithms no longer require a const comparison operator
-
-## Known Issues
-- cudafe++.exe may crash when parsing TBB headers on Windows. 
-
-# Thrust 1.5.3 (CUDA 5.0)      
-
-Summary
-- Small bug fixes
-
-## Bug Fixes
-- Avoid warnings about potential race due to __shared__ non-POD variable
-
-# Thrust 1.5.2 (CUDA 4.2)      
-
-Summary
-- Small bug fixes
-
-## Bug Fixes
-- Fixed warning about C-style initialization of structures
-
-# Thrust 1.5.1 (CUDA 4.1)      
-
-Summary
-- Small bug fixes
-
-## Bug Fixes
-- Sorting data referenced by permutation_iterators on CUDA produces invalid results
-
-# Thrust 1.5.0            
-
-Summary
-- Thrust v1.5.0 provides introduces new programmer productivity and performance
-- enhancements. New functionality for creating anonymous "lambda" functions has
-- been added. A faster host sort provides 2-10x faster performance for sorting
-- arithmetic types on (single-threaded) CPUs. A new OpenMP sort provides
-- 2.5x-3.0x speedup over the host sort using a quad-core CPU. When sorting
-- arithmetic types with the OpenMP backend the combined performance improvement
-- is 5.9x for 32-bit integers and ranges from 3.0x (64-bit types) to 14.2x
-- (8-bit types). A new CUDA reduce_by_key implementation provides 2-3x faster
-- performance.
-
-## Breaking Changes
-- device_ptr<void> no longer unsafely converts to device_ptr<T> without an
-- explicit cast. Use the expression
-- device_pointer_cast(static_cast<int*>(void_ptr.get()))
-- to convert, for example, device_ptr<void> to device_ptr<int>.
-
-## New Features
-- Functions
-        stencil-less transform_if
-
-- Types
-        lambda placeholders
-
-## New Examples
-- lambda
-
-## Other Enhancements
-- host sort is 2-10x faster for arithmetic types
-- OMP sort provides speedup over host sort
-- reduce_by_key is 2-3x faster
-- reduce_by_key no longer requires O(N) temporary storage
-- CUDA scan algorithms are 10-40% faster
-- host_vector and device_vector are now documented
-- out-of-memory exceptions now provide detailed information from CUDART
-- improved histogram example
-- device_reference now has a specialized swap
-- reduce_by_key and scan algorithms are compatible with discard_iterator
-
-Removed Functionality
-
-## Bug Fixes
-     #44 allow host_vector to compile when value_type uses __align__
-- #198 allow adjacent_difference to permit safe in-situ operation
-- #303 make thrust thread-safe
-- #313 avoid race conditions in device_vector::insert
-- #314 avoid unintended adl invocation when dispatching copy
-- #365 fix merge and set operation failures
-
-## Known Issues
-- None
-
-Acknowledgments
-- Thanks to Manjunath Kudlur for contributing his Carbon library, from which the lambda functionality is derived.
-- Thanks to Jean-Francois Bastien for suggesting a fix for issue 303.
-
-# Thrust 1.4.0 (CUDA 4.0)      
-
-Summary
-- Thrust v1.4.0 provides support for CUDA 4.0 in addition to many feature
-- and performance improvements.  New set theoretic algorithms operating on
-- sorted sequences have been added.  Additionally, a new fancy iterator
-- allows discarding redundant or otherwise unnecessary output from
-- algorithms, conserving memory storage and bandwidth.
-
-## Breaking Changes
-- Eliminations
-        thrust/is_sorted.h
-        thrust/utility.h
-        thrust/set_intersection.h
-        thrust/experimental/cuda/ogl_interop_allocator.h and the functionality therein
-        thrust::deprecated::copy_when
-        thrust::deprecated::absolute_value
-
-## New Features
-- Functions
-        copy_n
-        merge
-        set_difference
-        set_symmetric_difference
-        set_union
-
-- Types
-        discard_iterator
-
-- Device support
-        Compute Capability 2.1 GPUs
-
-## New Examples
-- run_length_decoding
-
-## Other Enhancements
-- Compilation warnings are substantially reduced in various contexts.
-- The compilation time of thrust::sort, thrust::stable_sort, thrust::sort_by_key,
-- and thrust::stable_sort_by_key are substantially reduced.
-- A fast sort implementation is used when sorting primitive types with thrust::greater.
-- The performance of thrust::set_intersection is improved.
-- The performance of thrust::fill is improved on SM 1.x devices.
-- A code example is now provided in each algorithm's documentation.
-- thrust::reverse now operates in-place
-
-Removed Functionality
-- thrust::deprecated::copy_when
-- thrust::deprecated::absolute_value
-- thrust::experimental::cuda::ogl_interop_allocator
-- thrust::gather and thrust::scatter from host to device and vice versa are no longer supported.
-- Operations which modify the elements of a thrust::device_vector are no longer
-- available from source code compiled without nvcc when the device backend is CUDA.
-- Instead, use the idiom from the cpp_interop example.
-
-## Bug Fixes
-- #212 set_intersection works correctly for large input sizes.
-- #275 counting_iterator and constant_iterator work correctly with OpenMP as the
-- backend when compiling with optimization
-- #256 min and max correctly return their first argument as a tie-breaker
-- #248 NDEBUG is interpreted correctly
-
-## Known Issues
-- nvcc may generate code containing warnings when compiling some Thrust algorithms.
-- When compiling with -arch=sm_1x, some Thrust algorithms may cause nvcc to issue
-- benign pointer advisories.
-- When compiling with -arch=sm_1x and -G, some Thrust algorithms may fail to execute correctly.
-- thrust::inclusive_scan, thrust::exclusive_scan, thrust::inclusive_scan_by_key,
-- and thrust::exclusive_scan_by_key are currently incompatible with thrust::discard_iterator.
-
-Acknowledgments
-- Thanks to David Tarjan for improving the performance of set_intersection.
-- Thanks to Duane Merrill for continued help with sort.
-- Thanks to Nathan Whitehead for help with CUDA Toolkit integration.
-
-# Thrust 1.3.0 (CUDA 3.2)      
-
-Summary
-- Thrust v1.3.0 provides support for CUDA 3.2 in addition to many feature
-- and performance enhancements.
-    
-- Performance of the sort and sort_by_key algorithms is improved by as much 
-- as 3x in certain situations.  The performance of stream compaction algorithms,
-- such as copy_if, is improved by as much as 2x.  Reduction performance is 
-- also improved, particularly for small input sizes.
-    
-- CUDA errors are now converted to runtime exceptions using the system_error
-- interface.  Combined with a debug mode, also new in v1.3, runtime errors
-- can be located with greater precision.
-
-- Lastly, a few header files have been consolidated or renamed for clarity.
-- See the deprecations section below for additional details.
-
-
-## Breaking Changes
-- Promotions
-        thrust::experimental::inclusive_segmented_scan has been renamed thrust::inclusive_scan_by_key and exposes a different interface
-        thrust::experimental::exclusive_segmented_scan has been renamed thrust::exclusive_scan_by_key and exposes a different interface
-        thrust::experimental::partition_copy has been renamed thrust::partition_copy and exposes a different interface
-        thrust::next::gather has been renamed thrust::gather
-        thrust::next::gather_if has been renamed thrust::gather_if
-        thrust::unique_copy_by_key has been renamed thrust::unique_by_key_copy
-- Deprecations
-        thrust::copy_when has been renamed thrust::deprecated::copy_when
-        thrust::absolute_value has been renamed thrust::deprecated::absolute_value
-        The header thrust/set_intersection.h is now deprecated; use thrust/set_operations.h instead
-        The header thrust/utility.h is now deprecated; use thrust/swap.h instead
-        The header thrust/swap_ranges.h is now deprecated; use thrust/swap.h instead
-- Eliminations
-        thrust::deprecated::gather
-        thrust::deprecated::gather_if
-        thrust/experimental/arch.h and the functions therein
-        thrust/sorting/merge_sort.h
-        thrust/sorting/radix_sort.h
-
-## New Features
-- Functions
-        exclusive_scan_by_key
-        find
-        find_if
-        find_if_not
-        inclusive_scan_by_key
-        is_partitioned
-        is_sorted_until
-        mismatch
-        partition_point
-        reverse
-        reverse_copy
-        stable_partition_copy
-
-- Types
-        system_error and related types
-        experimental::cuda::ogl_interop_allocator
-        bit_and, bit_or, and bit_xor
-
-- Device support
-        gf104-based GPUs
-
-## New Examples
-- opengl_interop.cu
-- repeated_range.cu
-- simple_moving_average.cu
-- sparse_vector.cu
-- strided_range.cu
-
-## Other Enhancements
-- Performance of thrust::sort and thrust::sort_by_key is substantially improved for primitive key types
-- Performance of thrust::copy_if is substantially improved
-- Performance of thrust::reduce and related reductions is improved
-- THRUST_DEBUG mode added
-- Callers of Thrust functions may detect error conditions by catching thrust::system_error, which derives from std::runtime_error
-- The number of compiler warnings generated by Thrust has been substantially reduced
-- Comparison sort now works correctly for input sizes > 32M
-- min & max usage no longer collides with <windows.h> definitions
-- Compiling against the OpenMP backend no longer requires nvcc
-- Performance of device_vector initialized in .cpp files is substantially improved in common cases
-- Performance of thrust::sort_by_key on the host is substantially improved
-
-Removed Functionality
-- nvcc 2.3 is no longer supported
-
-## Bug Fixes
-- Debug device code now compiles correctly
-- thrust::uninitialized_copy and thrust::unintialized_fill now dispatch constructors on the device rather than the host
-
-## Known Issues
-- #212 set_intersection is known to fail for large input sizes
-- partition_point is known to fail for 64b types with nvcc 3.2
-
-Acknowledgments
-- Thanks to Duane Merrill for contributing a fast CUDA radix sort implementation
-- Thanks to Erich Elsen for contributing an implementation of find_if
-- Thanks to Andrew Corrigan for contributing changes which allow the OpenMP backend to compile in the absence of nvcc
-- Thanks to Andrew Corrigan, Cliff Wooley, David Coeurjolly, Janick Martinez Esturo, John Bowers, Maxim Naumov, Michael Garland, and Ryuta Suzuki for bug reports
-- Thanks to Cliff Woolley for help with testing
-
-# Thrust 1.2.1 (CUDA 3.1)      
-
-Summary
-- Small fixes for compatibility with CUDA 3.1
-
-## Known Issues
-- inclusive_scan & exclusive_scan may fail with very large types
-- the Microsoft compiler may fail to compile code using both sort and binary search algorithms
-- uninitialized_fill & uninitialized_copy dispatch constructors on the host rather than the device
-- # 109 some algorithms may exhibit poor performance with the OpenMP backend with large numbers (>= 6) of CPU threads
-- default_random_engine::discard is not accelerated with nvcc 2.3
-- nvcc 3.1 may fail to compile code using types derived from thrust::subtract_with_carry_engine, such as thrust::ranlux24 & thrust::ranlux48.
-
-# Thrust 1.2.0            
-
-Summary
-- Thrust v1.2 introduces support for compilation to multicore CPUs
-- and the Ocelot virtual machine, and several new facilities for
-- pseudo-random number generation.  New algorithms such as set
-- intersection and segmented reduction have also been added.  Lastly,
-- improvements to the robustness of the CUDA backend ensure
-- correctness across a broad set of (uncommon) use cases.
-
-## Breaking Changes
-- thrust::gather's interface was incorrect and has been removed.
-- The old interface is deprecated but will be preserved for Thrust
-- version 1.2 at thrust::deprecated::gather &
-- thrust::deprecated::gather_if. The new interface is provided at
-- thrust::next::gather & thrust::next::gather_if.  The new interface
-- will be promoted to thrust:: in Thrust version 1.3. For more details,
-- please refer to this thread:
-- http://groups.google.com/group/thrust-users/browse_thread/thread/f5f0583cb97b51fd
-
-- The thrust::sorting namespace has been deprecated in favor of the
-- top-level sorting functions, such as thrust::sort() and
-- thrust::sort_by_key().
-
-## New Features
-- Functions
-        reduce_by_key
-        set_intersection
-        tie
-        unique_copy
-        unique_by_key
-        unique_copy_by_key
-
-- Types
-        Random Number Generation
-            discard_block_engine
-            default_random_engine
-            linear_congruential_engine
-            linear_feedback_shift_engine
-            minstd_rand
-            minstd_rand0
-            normal_distribution (experimental)
-            ranlux24
-            ranlux48
-            ranlux24_base
-            ranlux48_base
-            subtract_with_carry_engine
-            taus88
-            uniform_int_distribution
-            uniform_real_distribution
-            xor_combine_engine
-        Functionals
-            project1st
-            project2nd
-
-- Fancy Iterators
-        permutation_iterator
-        reverse_iterator
-
-- Device support
-        Add support for multicore CPUs via OpenMP
-        Add support for Fermi-class GPUs
-        Add support for Ocelot virtual machine
-
-## New Examples
-- cpp_integration
-- histogram
-- mode
-- monte_carlo
-- monte_carlo_disjoint_sequences
-- padded_grid_reduction
-- permutation_iterator
-- row_sum
-- run_length_encoding
-- segmented_scan
-- stream_compaction
-- summary_statistics
-- transform_iterator
-- word_count
-
-## Other Enhancements
-- vector functions operator!=, rbegin, crbegin, rend, crend, data, & shrink_to_fit
-- integer sorting performance is improved when max is large but (max - min) is small and when min is negative
-- performance of inclusive_scan() and exclusive_scan() is improved by 20-25% for primitive types
-- support for nvcc 3.0
-
-Removed Functionality
-- removed support for equal between host & device sequences
-- removed support for gather() and scatter() between host & device sequences
-
-## Bug Fixes
-- # 8 cause a compiler error if the required compiler is not found rather than a mysterious error at link time
-- # 42 device_ptr & device_reference are classes rather than structs, eliminating warnings on certain platforms
-- # 46 gather & scatter handle any space iterators correctly
-- # 51 thrust::experimental::arch functions gracefully handle unrecognized GPUs
-- # 52 avoid collisions with common user macros such as BLOCK_SIZE
-- # 62 provide better documentation for device_reference
-- # 68 allow built-in CUDA vector types to work with device_vector in pure C++ mode
-- # 102 eliminated a race condition in device_vector::erase
-- various compilation warnings eliminated
-
-## Known Issues
-   inclusive_scan & exclusive_scan may fail with very large types
-   the Microsoft compiler may fail to compile code using both sort and binary search algorithms
-   uninitialized_fill & uninitialized_copy dispatch constructors on the host rather than the device
-   # 109 some algorithms may exhibit poor performance with the OpenMP backend with large numbers (>= 6) of CPU threads
-   default_random_engine::discard is not accelerated with nvcc 2.3
-
-Acknowledgments
-   Thanks to Gregory Diamos for contributing a CUDA implementation of set_intersection
-   Thanks to Ryuta Suzuki & Gregory Diamos for rigorously testing Thrust's unit tests and examples against Ocelot
-   Thanks to Tom Bradley for contributing an implementation of normal_distribution
-   Thanks to Joseph Rhoads for contributing the example summary_statistics
-
-# Thrust 1.1.1            
-
-Summary
-- Small fixes for compatibility with CUDA 2.3a and Mac OSX Snow Leopard.
-
-# Thrust 1.1.0            
-
-Summary
-- Thrust v1.1 introduces fancy iterators, binary search functions, and
-- several specialized reduction functions.  Experimental support for
-- segmented scan has also been added.
-
-## Breaking Changes
-- counting_iterator has been moved into the thrust namespace (previously thrust::experimental)
-
-## New Features
-- Functions
-        copy_if
-        lower_bound
-        upper_bound
-        vectorized lower_bound
-        vectorized upper_bound
-        equal_range
-        binary_search
-        vectorized binary_search
-        all_of
-        any_of
-        none_of
-        minmax_element
-        advance
-        inclusive_segmented_scan (experimental)
-        exclusive_segmented_scan (experimental)
-
-- Types
-        pair
-        tuple
-        device_malloc_allocator
-
-- Fancy Iterators
-        constant_iterator
-        counting_iterator
-        transform_iterator
-        zip_iterator
-
-## New Examples
-- computing the maximum absolute difference between vectors
-- computing the bounding box of a two-dimensional point set
-- sorting multiple arrays together (lexicographical sorting)
-- constructing a summed area table
-- using zip_iterator to mimic an array of structs
-- using constant_iterator to increment array values
-
-## Other Enhancements
-- added pinned memory allocator (experimental)
-- added more methods to host_vector & device_vector (issue #4)
-- added variant of remove_if with a stencil argument (issue #29)
-- scan and reduce use cudaFuncGetAttributes to determine grid size
-- exceptions are reported when temporary device arrays cannot be allocated 
-
-## Bug Fixes
-     #5 make vector work for larger data types
-     #9 stable_partition_copy doesn't respect OutputIterator concept semantics
-- #10 scans should return OutputIterator
-- #16 make algorithms work for larger data types
-- #27 dispatch radix_sort even when comp=less<T> is explicitly provided
-
-## Known Issues
-- Using functors with Thrust entry points may not compile on Mac OSX with gcc
-    4.0.1.
-- `thrust::uninitialized_copy` and `thrust::uninitialized_fill` dispatch
-    constructors on the host rather than the device.
-- `thrust::inclusive_scan`, `thrust::inclusive_scan_by_key`,
-    `thrust::exclusive_scan`, and `thrust::exclusive_scan_by_key` may fail when
-    used with large types with the CUDA 3.1 driver.
-
-# Thrust 1.0.0            
-
-## Breaking Changes
-- Rename top level namespace `komrade` to `thrust`.
-- Move `thrust::partition_copy` & `thrust::stable_partition_copy` into
-    `thrust::experimental` namespace until we can easily provide the standard
-    interface.
-- Rename `thrust::range` to `thrust::sequence` to avoid collision with
-    Boost.Range.
-- Rename `thrust::copy_if` to `thrust::copy_when` due to semantic differences
-    with C++0x copy_if().
-
-## New Features
-- Add C++0x style `cbegin` & `cend` methods to `thrust::host_vector` and
-    `thrust::device_vector`.
-- Add `thrust::transform_if` function.
-- Add stencil versions of `thrust::replace_if` & `thrust::replace_copy_if`.
-- Allow `counting_iterator` to work with `thrust::for_each`.
-- Allow types with constructors in comparison `thrust::sort` and
-    `thrust::reduce`.
-
-## Other Enhancements
-- `thrust::merge_sort` and `thrust::stable_merge_sort` are now 2x to 5x faster
-    when executed on the parallel device.
-
-## Bug Fixes
-- Komrade 6: Workaround an issue where an incremented iterator causes NVCC to
-    crash.
-- Komrade 7: Fix an issue where `const_iterator`s could not be passed to
-    `thrust::transform`.
-
diff --git a/doc/thrust_logo.png b/doc/thrust_logo.png
deleted file mode 100644
index 123794b6a..000000000
Binary files a/doc/thrust_logo.png and /dev/null differ
diff --git a/doc/thrust_logo.svg b/doc/thrust_logo.svg
deleted file mode 100644
index 4fd82acaf..000000000
--- a/doc/thrust_logo.svg
+++ /dev/null
@@ -1,272 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!-- Created with Inkscape (http://www.inkscape.org/) -->
-<svg
-   xmlns:dc="http://purl.org/dc/elements/1.1/"
-   xmlns:cc="http://creativecommons.org/ns#"
-   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-   xmlns:svg="http://www.w3.org/2000/svg"
-   xmlns="http://www.w3.org/2000/svg"
-   xmlns:xlink="http://www.w3.org/1999/xlink"
-   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
-   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
-   width="1052.3622"
-   height="744.09448"
-   id="svg2"
-   sodipodi:version="0.32"
-   inkscape:version="0.46"
-   version="1.0"
-   sodipodi:docname="thrust_logo.svg"
-   inkscape:output_extension="org.inkscape.output.svg.inkscape"
-   inkscape:export-filename="/home/nathan/Desktop/Old/logos/thrust3svg.jpg.png"
-   inkscape:export-xdpi="90"
-   inkscape:export-ydpi="90">
-  <defs
-     id="defs4">
-    <linearGradient
-       id="linearGradient5922">
-      <stop
-         style="stop-color:#b3b3b3;stop-opacity:1;"
-         offset="0"
-         id="stop5924" />
-      <stop
-         style="stop-color:#b3b3b3;stop-opacity:0;"
-         offset="1"
-         id="stop5926" />
-    </linearGradient>
-    <linearGradient
-       id="linearGradient5886">
-      <stop
-         id="stop5888"
-         offset="0"
-         style="stop-color:#666666;stop-opacity:1;" />
-      <stop
-         style="stop-color:#e3e3e3;stop-opacity:1;"
-         offset="0.47389936"
-         id="stop5890" />
-      <stop
-         id="stop5892"
-         offset="1"
-         style="stop-color:#666666;stop-opacity:1;" />
-    </linearGradient>
-    <linearGradient
-       id="linearGradient5840">
-      <stop
-         id="stop5842"
-         offset="0"
-         style="stop-color:#1a1a1a;stop-opacity:1;" />
-      <stop
-         style="stop-color:#cbcbcb;stop-opacity:1;"
-         offset="0.42692322"
-         id="stop5844" />
-      <stop
-         id="stop5846"
-         offset="1"
-         style="stop-color:#252525;stop-opacity:1;" />
-    </linearGradient>
-    <linearGradient
-       id="linearGradient5795">
-      <stop
-         style="stop-color:#666666;stop-opacity:1;"
-         offset="0"
-         id="stop5797" />
-      <stop
-         id="stop5805"
-         offset="0.36170211"
-         style="stop-color:#e3e3e3;stop-opacity:1;" />
-      <stop
-         style="stop-color:#666666;stop-opacity:1;"
-         offset="1"
-         id="stop5799" />
-    </linearGradient>
-    <linearGradient
-       id="linearGradient5773">
-      <stop
-         style="stop-color:#3b3b3b;stop-opacity:1;"
-         offset="0"
-         id="stop5775" />
-      <stop
-         id="stop5781"
-         offset="0.4955157"
-         style="stop-color:#ececec;stop-opacity:0.49803922;" />
-      <stop
-         style="stop-color:#000000;stop-opacity:0;"
-         offset="1"
-         id="stop5777" />
-    </linearGradient>
-    <linearGradient
-       id="linearGradient5743">
-      <stop
-         style="stop-color:#626161;stop-opacity:1;"
-         offset="0"
-         id="stop5745" />
-      <stop
-         id="stop5753"
-         offset="0.44680852"
-         style="stop-color:#161882;stop-opacity:0.49803922;" />
-      <stop
-         style="stop-color:#00bb00;stop-opacity:0;"
-         offset="1"
-         id="stop5747" />
-    </linearGradient>
-    <linearGradient
-       id="linearGradient3213">
-      <stop
-         style="stop-color:#000000;stop-opacity:1;"
-         offset="0"
-         id="stop3215" />
-      <stop
-         style="stop-color:#a7a7a7;stop-opacity:0;"
-         offset="1"
-         id="stop3217" />
-    </linearGradient>
-    <inkscape:perspective
-       sodipodi:type="inkscape:persp3d"
-       inkscape:vp_x="0 : 526.18109 : 1"
-       inkscape:vp_y="0 : 1000 : 0"
-       inkscape:vp_z="744.09448 : 526.18109 : 1"
-       inkscape:persp3d-origin="372.04724 : 350.78739 : 1"
-       id="perspective10" />
-    <linearGradient
-       inkscape:collect="always"
-       xlink:href="#linearGradient5795"
-       id="linearGradient5810"
-       gradientUnits="userSpaceOnUse"
-       gradientTransform="matrix(1.0247944,0,0,0.7176622,-1246.1936,214.03097)"
-       x1="771.13623"
-       y1="-287.25806"
-       x2="1120.5692"
-       y2="201.83484" />
-    <linearGradient
-       inkscape:collect="always"
-       xlink:href="#linearGradient5795"
-       id="linearGradient5824"
-       gradientUnits="userSpaceOnUse"
-       gradientTransform="matrix(1.0247944,0,0,0.7176622,-1227.724,586.99847)"
-       x1="771.13623"
-       y1="-287.25806"
-       x2="663.33466"
-       y2="-144.52788" />
-    <linearGradient
-       inkscape:collect="always"
-       xlink:href="#linearGradient5840"
-       id="linearGradient5838"
-       gradientUnits="userSpaceOnUse"
-       gradientTransform="matrix(1.0247944,0,0,0.7176622,-1246.1936,214.03097)"
-       x1="771.13623"
-       y1="-287.25806"
-       x2="1137.2974"
-       y2="174.0116" />
-  </defs>
-  <sodipodi:namedview
-     id="base"
-     pagecolor="#ffffff"
-     bordercolor="#666666"
-     borderopacity="1.0"
-     gridtolerance="10000"
-     guidetolerance="10"
-     objecttolerance="10"
-     inkscape:pageopacity="0.0"
-     inkscape:pageshadow="2"
-     inkscape:zoom="1"
-     inkscape:cx="513.86573"
-     inkscape:cy="372.04724"
-     inkscape:document-units="px"
-     inkscape:current-layer="layer1"
-     showgrid="false"
-     inkscape:window-width="1920"
-     inkscape:window-height="1125"
-     inkscape:window-x="0"
-     inkscape:window-y="25" />
-  <metadata
-     id="metadata7">
-    <rdf:RDF>
-      <cc:Work
-         rdf:about="">
-        <dc:format>image/svg+xml</dc:format>
-        <dc:type
-           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-      </cc:Work>
-    </rdf:RDF>
-  </metadata>
-  <g
-     inkscape:label="Layer 1"
-     inkscape:groupmode="layer"
-     id="layer1">
-    <g
-       id="g3189"
-       inkscape:export-filename="/home/nathan/NV/thrust/doc/thrust_logo.png"
-       inkscape:export-xdpi="47.029999"
-       inkscape:export-ydpi="47.029999">
-      <path
-         d="M 256.90625,209.34375 C 245.27561,209.38319 234.38709,213.94209 226.03125,221.0625 C 216.48171,229.20011 209.59283,242.94767 214.65625,256.65625 L 288.125,455.5625 C 291.48237,464.65215 295.87551,473.99003 303.21875,481.625 C 310.56199,489.25997 321.45303,494.71875 334.15625,494.71875 L 805.34375,494.71875 C 817.97624,494.71876 828.98878,489.54948 836.625,481.90625 C 844.26122,474.26302 848.88495,464.56763 851.65625,454.6875 L 889.5,319.75 C 893.24724,306.39046 886.23452,293.51892 877,286.21875 C 867.76548,278.91858 856.12028,274.84557 844.4375,273.5625 L 261.9375,209.59375 C 260.25138,209.40857 258.56777,209.33812 256.90625,209.34375 z"
-         inkscape:href="#rect2474"
-         id="path3265"
-         style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:2.10967277999999991;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1"
-         xlink:href="#rect2474"
-         inkscape:original="M 258.6875 221.03125 C 239.30554 218.90262 217.29031 236.04476 223.4375 252.6875 L 296.90625 451.59375 C 303.05344 468.2365 312.62987 483.21875 332.15625 483.21875 L 803.34375 483.21875 C 822.87016 483.21876 833.82448 468.59699 838.59375 451.59375 L 876.4375 316.65625 C 881.20677 299.65302 860.56946 287.12863 841.1875 285 L 258.6875 221.03125 z "
-         inkscape:radius="11.495221"
-         sodipodi:type="inkscape:offset" />
-      <path
-         sodipodi:nodetypes="czzzzzzzz"
-         id="rect2474"
-         d="M 841.1984,285.00037 L 258.69824,221.02711 C 239.31628,218.89848 217.30488,236.03474 223.45207,252.67748 L 296.91964,451.58125 C 303.06684,468.22399 312.63943,483.23161 332.16581,483.23161 L 803.35147,483.23161 C 822.87785,483.23161 833.82838,468.58449 838.59765,451.58125 L 876.44458,316.65074 C 881.21385,299.6475 860.58036,287.129 841.1984,285.00037 z"
-         style="fill:#66b366;fill-opacity:1;stroke:#000000;stroke-width:2.10967277999999991;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1" />
-    </g>
-    <g
-       id="g3251"
-       transform="matrix(0.913744,0,0,0.3451662,176.2736,220.85042)"
-       style="opacity:1"
-       inkscape:export-filename="/home/nathan/NV/thrust/doc/thrust_logo.png"
-       inkscape:export-xdpi="47.029999"
-       inkscape:export-ydpi="47.029999">
-      <g
-         id="g3253"
-         transform="matrix(2.0484578,-1.263301,0.1197948,2.5356515,-182.46458,-362.9203)">
-        <path
-           sodipodi:type="inkscape:offset"
-           inkscape:radius="5.4485359"
-           inkscape:original="M 291.6875 279 C 206.19469 277.76693 90.813927 330.28055 44.5625 378.59375 C 119.00866 442.66663 390.60576 547.17687 393.5 375.5625 C 394.67595 305.83429 350.18258 279.84368 291.6875 279 z "
-           xlink:href="#path3255"
-           style="fill:#666666;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
-           id="path3263"
-           inkscape:href="#path3255"
-           d="M 291.78125,273.5625 C 247.88427,272.92937 197.14434,285.95647 151.3125,305.1875 C 105.48066,324.41853 64.633863,349.73338 40.625,374.8125 C 39.587603,375.89202 39.04008,377.35083 39.111013,378.84633 C 39.181946,380.34183 39.865085,381.74226 41,382.71875 C 79.595929,415.93675 166.14169,457.95278 244.96875,470.84375 C 284.38228,477.28923 321.94436,476.49105 350.625,462.34375 C 379.30564,448.19645 398.18956,420.0057 398.9375,375.65625 C 399.5452,339.62233 388.08647,313.71403 368.46875,297.28125 C 348.85103,280.84847 321.81559,273.99569 291.78125,273.5625 z" />
-        <path
-           style="fill:#ffee00;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
-           d="M 393.50906,375.56396 C 396.40371,203.9253 122.46857,297.21173 44.57143,378.58133 C 119.01759,442.65421 390.61482,547.17833 393.50906,375.56396 z"
-           id="path3255"
-           sodipodi:nodetypes="ccz" />
-        <path
-           style="fill:#ffb500;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
-           d="M 385.4286,375.1448 C 388.01423,252.50309 143.32293,319.15945 73.741661,377.30082 C 140.24036,423.0831 382.84333,497.76917 385.4286,375.1448 z"
-           id="path3257"
-           sodipodi:nodetypes="ccz" />
-        <path
-           style="fill:#ff6c00;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
-           d="M 382.08135,375.00509 C 383.96651,268.69569 205.56124,326.47536 154.8293,376.87398 C 203.31374,416.55939 380.19638,481.29945 382.08135,375.00509 z"
-           id="path3259"
-           sodipodi:nodetypes="ccz" />
-        <path
-           style="fill:#e42800;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
-           d="M 378.29864,374.84209 C 379.58638,287.58705 257.71919,335.01058 223.06461,376.37601 C 256.18393,408.9484 377.01103,462.08477 378.29864,374.84209 z"
-           id="path3261"
-           sodipodi:nodetypes="ccz" />
-      </g>
-    </g>
-    <text
-       xml:space="preserve"
-       style="font-size:178.33847046px;font-style:italic;font-variant:normal;font-weight:bold;font-stretch:normal;opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1.99999785;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;font-family:UnDotum;-inkscape-font-specification:UnDotum Bold Italic"
-       x="352.8208"
-       y="466.72366"
-       id="text3247"
-       transform="matrix(1.0688669,0,-0.2132749,0.9355701,0,0)"
-       inkscape:export-filename="/home/nathan/NV/thrust/doc/thrust_logo.png"
-       inkscape:export-xdpi="47.029999"
-       inkscape:export-ydpi="47.029999"><tspan
-         sodipodi:role="line"
-         id="tspan3249"
-         x="352.8208"
-         y="466.72366"
-         style="font-size:178.33847046px;font-style:italic;font-variant:normal;font-weight:bold;font-stretch:normal;fill:#ffffff;stroke:#000000;stroke-width:1.99999785;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;font-family:UnDotum;-inkscape-font-specification:UnDotum Bold Italic">Thrust</tspan></text>
-  </g>
-</svg>
diff --git a/docs/doxybook/config.json b/docs/doxybook/config.json
new file mode 100644
index 000000000..56b7a238b
--- /dev/null
+++ b/docs/doxybook/config.json
@@ -0,0 +1,49 @@
+{
+  "baseUrl": "{{ site.baseurl }}/api/",
+  "copyImages": true,
+  "fileExt": "md",
+  "filesFilter": [],
+  "folderClassesName": "classes",
+  "folderExamplesName": "examples",
+  "folderFilesName": "files",
+  "folderGroupsName": "groups",
+  "folderNamespacesName": "namespaces",
+  "folderRelatedPagesName": "pages",
+  "imagesFolder": "images",
+  "indexClassesName": "index_classes",
+  "indexClassesTitle": "Classes",
+  "indexExamplesName": "index_examples",
+  "indexExamplesTitle": "Examples",
+  "indexFilesName": "index_files",
+  "indexFilesTitle": "Files",
+  "indexGroupsName": "index_groups",
+  "indexGroupsTitle": "Groups",
+  "indexInFolders": false,
+  "indexNamespacesName": "index_namespaces",
+  "indexNamespacesTitle": "namespaces",
+  "indexRelatedPagesName": "index_pages",
+  "indexRelatedPagesTitle": "pages",
+  "linkLowercase": true,
+  "linkAndInlineCodeAsHTML": true,
+  "linkSuffix": ".html",
+  "mainPageInRoot": false,
+  "mainPageName": "indexpage",
+  "sort": false,
+  "templateIndexClasses": "index_classes",
+  "templateIndexExamples": "index_examples",
+  "templateIndexFiles": "index_files",
+  "templateIndexGroups": "index_groups",
+  "templateIndexNamespaces": "index_namespaces",
+  "templateIndexRelatedPages": "index_pages",
+  "templateKindClass": "kind_class",
+  "templateKindDir": "kind_file",
+  "templateKindExample": "kind_page",
+  "templateKindFile": "kind_file",
+  "templateKindGroup": "kind_nonclass",
+  "templateKindInterface": "kind_class",
+  "templateKindNamespace": "kind_nonclass",
+  "templateKindPage": "kind_page",
+  "templateKindStruct": "kind_class",
+  "templateKindUnion": "kind_class",
+  "useFolders": true
+}
diff --git a/docs/doxybook/templates/class_members.tmpl b/docs/doxybook/templates/class_members.tmpl
new file mode 100644
index 000000000..cb5f65f38
--- /dev/null
+++ b/docs/doxybook/templates/class_members.tmpl
@@ -0,0 +1,210 @@
+{%- if exists("publicClasses") or exists("publicTypes") or exists("publicAttributes") or exists("publicFunctions") or exists("friends") -%}
+  {%- set has_public_members = true -%}
+{%- endif -%}
+{%- if exists("protectedClasses") or exists("protectedTypes") or exists("protectedAttributes") or exists("protectedFunctions") -%}
+  {%- set has_protected_members = true -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}
+  {%- for base in baseClasses -%}
+    {%- if existsIn(base, "publicClasses") or existsIn(base, "publicTypes") or existsIn(base, "publicAttributes") or existsIn(base, "publicFunctions") or existsIn(base, "friends") -%}
+      {%- set has_public_members = true -%}
+    {%- endif -%}
+    {%- if existsIn(base, "protectedClasses") or existsIn(base, "protectedTypes") or existsIn(base, "protectedAttributes") or existsIn(base, "protectedFunctions") -%}
+      {%- set has_protected_members = true -%}
+    {%- endif -%}
+  {%- endfor -%}
+{%- endif -%}
+<code class="doxybook">
+{%- if exists("includes") -%}
+  <span>#include {{includes}}</span>{{ noop() -}}
+  <br>
+{%- endif -%}
+{%- include "synopsis_template_parameters.tmpl" -%}
+<span>{%- include "synopsis_kind.tmpl" -%}{% include "name_qualified.tmpl" %} {</span>
+{%- set synopsis_indent_width = 2 -%}
+{%- set names_qualified = false -%}
+{%- if default(has_public_members, false) -%}
+  <span>public:</span>{{- noop() -}}
+{%- endif -%}
+{%- if exists("publicTypes") -%}
+  {%- for child in publicTypes -%}
+    {%- include "synopsis_type.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}{%- for base in baseClasses -%}
+  {%- if existsIn(base, "publicTypes") -%}
+    {%- for child in base.publicTypes -%}
+      {%- set synopsis_is_inherited = true -%}
+      {%- include "synopsis_type.tmpl" -%}
+      {%- set synopsis_is_inherited = false -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endfor -%}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
+{%- if exists("publicClasses") -%}
+  {%- for child in publicClasses -%}
+    {%- include "synopsis_class.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}{%- for base in baseClasses -%}
+  {%- if existsIn(base, "publicClasses") -%}
+    {%- for child in base.publicClasses -%}
+      {%- set synopsis_is_inherited = true -%}
+      {%- include "synopsis_class.tmpl" -%}
+      {%- set synopsis_is_inherited = false -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endfor -%}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
+{%- if exists("friends") -%}
+  {%- for child in friends -%}
+    {%- if child.type == "class" or child.type == "struct" -%}
+      {%- include "synopsis_friend_class.tmpl" -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endif -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}{%- for base in baseClasses -%}
+  {%- if existsIn(base, "friends") -%}
+    {%- for child in base.friends -%}
+      {%- if child.type == "class" or child.type == "struct" -%}
+        {%- set synopsis_is_inherited = true -%}
+        {%- include "synopsis_friend_class.tmpl" -%}
+        {%- set synopsis_is_inherited = false -%}
+        {%- set synopsis_needs_leading_line_break = true -%}
+      {%- endif -%}
+    {%- endfor -%}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
+{%- if exists("publicAttributes") -%}
+  {%- for child in publicAttributes -%}
+    {%- include "synopsis_variable.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}{%- for base in baseClasses -%}
+  {%- if existsIn(base, "publicAttributes") -%}
+    {%- for child in base.publicAttributes -%}
+      {%- set synopsis_is_inherited = true -%}
+      {%- include "synopsis_variable.tmpl" -%}
+      {%- set synopsis_is_inherited = false -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endfor -%}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
+{%- if exists("publicFunctions") -%}
+  {%- for child in publicFunctions -%}
+    {%- include "synopsis_function.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}{%- for base in baseClasses -%}
+  {%- if existsIn(base, "publicFunctions") -%}
+    {%- for child in base.publicFunctions -%}
+      {%- set synopsis_is_inherited = true -%}
+      {%- include "synopsis_function.tmpl" -%}
+      {%- set synopsis_is_inherited = false -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endfor -%}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
+{%- if exists("friends") -%}
+  {%- for child in friends -%}
+    {%- if child.type != "class" and child.type != "struct" -%}
+      {%- include "synopsis_friend_function.tmpl" -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endif -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}{%- for base in baseClasses -%}
+  {%- if existsIn(base, "friends") -%}
+    {%- for child in base.friends -%}
+      {%- if child.type != "class" and child.type != "struct" -%}
+        {%- set synopsis_is_inherited = true -%}
+        {%- include "synopsis_friend_function.tmpl" -%}
+        {%- set synopsis_is_inherited = false -%}
+        {%- set synopsis_needs_leading_line_break = true -%}
+      {%- endif -%}
+    {%- endfor -%}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
+{%- if default(has_public_members, false) -%}
+  {%- if default(has_protected_members, false) -%}
+    <br>
+  {%- endif -%}
+{%- endif -%}
+{#- Reset leading line breaks for protected members -#}{{ noop() -}}
+{%- set synopsis_needs_leading_line_break = false -%}
+{%- if default(has_protected_members, false) -%}
+  <span>protected:</span>{{- noop() -}}
+{%- endif -%}
+{%- if exists("protectedTypes") -%}
+  {%- for child in protectedTypes -%}
+    {%- include "synopsis_type.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}{%- for base in baseClasses -%}
+  {%- if existsIn(base, "protectedTypes") -%}
+    {%- for child in base.protectedTypes -%}
+      {%- set synopsis_is_inherited = true -%}
+      {%- include "synopsis_type.tmpl" -%}
+      {%- set synopsis_is_inherited = false -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endfor -%}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
+{%- if exists("protectedClasses") -%}
+  {%- for child in protectedClasses -%}
+    {%- include "synopsis_class.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}{%- for base in baseClasses -%}
+  {%- if existsIn(base, "protectedClasses") -%}
+    {%- for child in base.protectedClasses -%}
+      {%- set synopsis_is_inherited = true -%}
+      {%- include "synopsis_class.tmpl" -%}
+      {%- set synopsis_is_inherited = false -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endfor -%}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
+{%- if exists("protectedAttributes") -%}
+  {%- for child in protectedAttributes -%}
+    {%- include "synopsis_variable.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}{%- for base in baseClasses -%}
+  {%- if existsIn(base, "protectedAttributes") -%}
+    {%- for child in base.protectedAttributes -%}
+      {%- set synopsis_is_inherited = true -%}
+      {%- include "synopsis_variable.tmpl" -%}
+      {%- set synopsis_is_inherited = false -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endfor -%}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
+{%- if exists("protectedFunctions") -%}
+  {%- for child in protectedFunctions -%}
+    {%- include "synopsis_function.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}{%- for base in baseClasses -%}
+  {%- if existsIn(base, "protectedFunctions") -%}
+    {%- for child in base.protectedFunctions -%}
+      {%- set synopsis_is_inherited = true -%}
+      {%- include "synopsis_function.tmpl" -%}
+      {%- set synopsis_is_inherited = false -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endfor -%}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
+{%- set synopsis_indent_width = 0 -%}
+<span>};</span>
+</code>
+
diff --git a/docs/doxybook/templates/class_members_details.tmpl b/docs/doxybook/templates/class_members_details.tmpl
new file mode 100644
index 000000000..a77eec5ef
--- /dev/null
+++ b/docs/doxybook/templates/class_members_details.tmpl
@@ -0,0 +1,49 @@
+{%- if exists("publicClasses") -%}## Member Classes
+
+  {%- for child in publicClasses -%}
+    {% include "title_member.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
+{%- if exists("publicTypes") -%}## Member Types
+
+  {%- for child in publicTypes -%}
+    {% include "title_member.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
+{%- if exists("publicAttributes") %}## Member Variables
+
+  {%- for child in publicAttributes -%}
+    {% include "title_member.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
+{%- if exists("publicFunctions") %}## Member Functions
+
+  {%- for child in publicFunctions -%}
+    {% include "title_member.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
+{%- if exists("protectedTypes") -%}## Protected Member Types
+  {%- for child in publicTypes -%}
+    {% include "title_member.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{% endif -%}
+{%- if exists("protectedAttributes") -%}## Protected Member Variables
+
+  {%- for child in protectedAttributes -%}
+    {% include "title_member.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
+{%- if exists("protectedFunctions") -%}## Protected Member Functions
+
+  {%- for child in protectedFunctions -%}
+    {% include "title_member.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
+
diff --git a/docs/doxybook/templates/details.tmpl b/docs/doxybook/templates/details.tmpl
new file mode 100644
index 000000000..d72119abf
--- /dev/null
+++ b/docs/doxybook/templates/details.tmpl
@@ -0,0 +1,206 @@
+{%- if exists("brief") -%}{{brief}}
+
+{% endif -%}
+{%- if exists("details") -%}{{details}}
+
+{% endif -%}
+{%- if exists("inbody") -%}{{inbody}}
+
+{% endif -%}
+{%- if exists("tests") -%}**Test**:
+  {%- if length(tests) == 1 -%}{{first(tests)}}
+  {%- else -%}
+    {%- for item in tests -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("note") -%}**Note**:
+  {%- if length(note) == 1 -%}{{first(note)}}
+  {%- else -%}
+    {%- for item in note -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("remark") -%}**Remark**:
+  {%- if length(remark) == 1 -%}{{first(remark)}}
+  {%- else -%}
+    {%- for item in remark -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("attention") -%}**Attention**:
+  {%- if length(attention) == 1 -%}{{first(attention)}}
+  {%- else -%}
+    {%- for item in attention -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("bugs") -%}**Bug**:
+  {%- if length(bugs) == 1 -%}{{first(bugs)}}
+  {%- else -%}
+    {%- for item in bugs -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("warning") -%}**Warning**:
+  {%- if length(warning) == 1 -%}{{first(warning)}}
+  {%- else -%}
+    {%- for item in warning -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("todos") -%}**TODO**:
+  {%- if length(todos) == 1 -%}{{first(todos)}}
+  {%- else -%}
+    {%- for item in todos -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("templateParamsList") -%}**Template Parameters**:
+  {%- if length(templateParamsList) == 1 -%}**`{{get(first(templateParamsList), "name")}}`**: {{get(first(templateParamsList), "text")}}
+  {%- else -%}
+    {%- for param in templateParamsList -%}* **`{{param.name}}`** {{param.text}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("paramList") -%}**Function Parameters**:
+  {%- if length(paramList) == 1 -%}**`{{get(first(paramList), "name")}}`**: {{get(first(paramList), "text")}}
+  {%- else -%}
+    {%- for param in paramList -%}* **`{{param.name}}`** {{param.text}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("pre") -%}**Preconditions**:
+  {%- if length(pre) == 1 -%}{{first(pre)}}
+  {%- else -%}
+    {%- for item in pre -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("post") -%}**Postconditions**:
+  {%- if length(post) == 1 -%}{{first(post)}}
+  {%- else -%}
+    {%- for item in post -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("invariant") -%}**Invariant**:
+  {%- if length(invariant) == 1 -%}{{first(invariant)}}
+  {%- else -%}
+    {%- for item in invariant -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("returns") or exists("returnsList") -%}**Returns**:
+  {%- if exists("returns") and exists("returnsList") -%}
+    {%- for item in returns -%}* {{item}}
+    {%- endfor -%}
+    {%- for item in returnsList -%}* **`{{item.name}}`**: {{item.text}}
+    {%- endfor -%}
+  {%- else if exists("returns") -%}
+    {%- if length(returns) == 1 -%}{{first(returns)}}
+    {%- else -%} 
+      {%- for item in returns -%}* {{item}}
+      {%- endfor -%}
+    {%- endif -%}
+  {%- else if exists("returnsList") -%}
+    {%- if length(returnsList) == 1 -%}**`{{get(first(returnsList), "name")}}`** {{get(first(returnsList), "text")}}
+    {%- else -%} 
+      {%- for item in returnsList -%}* **`{{item.name}}`**: {{item.text}}
+      {%- endfor -%}
+    {%- endif -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("exceptionsList") -%}**Exceptions**:
+  {%- if length(exceptionsList) == 1 -%}**`{{get(first(exceptionsList), "name")}}`**: {{get(first(exceptionsList), "text")}}
+  {%- else -%}
+    {%- for param in exceptionsList -%}* **`{{param.name}}`**: {{param.text}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("deprecated") -%}**Deprecated**: {{deprecated}}
+
+{% endif -%}
+{%- if exists("authors") -%}**Author**:
+  {%- if length(authors) == 1 -%}{{first(authors)}}
+  {%- else -%}
+    {%- for item in authors -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("copyright") -%}**Copyright**:
+  {%- if length(copyright) == 1 -%}{{first(copyright)}}
+  {%- else -%}
+    {%- for item in copyright -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("version") -%}**Version**:
+  {%- if length(version) == 1 -%}{{first(version)}}
+  {%- else -%}
+    {%- for item in version -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("since") -%}**Since**:
+  {%- if length(since) == 1 -%}{{first(since)}}
+  {%- else -%}
+    {%- for item in since -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("date") -%}**Date**:
+  {%- if length(date) == 1 -%}{{first(date)}}
+  {%- else -%}
+    {%- for item in date -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("baseClasses") -%}**Inherits From**:
+  {%- if length(baseClasses) == 1 -%}
+    {%- if existsIn(first(baseClasses), "url") -%}[`{{get(first(baseClasses), "name")}}`]({{get(first(baseClasses), "url")}})
+    {%- else -%}`{{get(first(baseClasses), "name")}}`
+    {%- endif -%}
+  {%- else -%}
+    {%- for base in baseClasses -%}
+      {%- if existsIn(baseClasses, "url") -%}* [`{{base.name}}`]({{base.url}})
+      {%- else -%}* `{{base.name}}`
+      {%- endif -%}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("derivedClasses") -%}**Inherited By**:
+  {%- if length(derivedClasses) == 1 -%}
+    {%- if existsIn(first(derivedClasses), "url") -%}[`{{get(first(derivedClasses), "name")}}`]({{get(first(derivedClasses), "url")}})
+    {%- else -%}`{{get(first(derivedClasses), "name")}}`
+    {%- endif -%}
+  {%- else -%}
+    {%- for derived in derivedClasses -%}
+      {%- if existsIn(first(derivedClasses), "url") -%}* [`{{derived.name}}`]({{derived.url}})
+      {%- else -%}* `{{derived.name}}`{%- endif -%}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("reimplements") -%}**Implements**: [`{{reimplements.name}}`]({{reimplements.url}})
+
+{% endif -%}
+{%- if exists("reimplementedBy") -%}**Implemented By**:
+  {%- if length(reimplementedBy) == 1 -%}
+    {%- if existsIn(first(reimplementedBy), "url") -%}[`{{get(first(reimplementedBy), "name")}}`]({{get(first(reimplementedBy), "url")}})
+    {%- else -%}`{{get(first(reimplementedBy), "name")}}`
+    {%- endif -%}
+  {%- else -%}
+    {%- for impl in reimplementedBy -%}
+      {%- if existsIn(first(reimplementedBy), "url") -%}* [`{{impl.name}}`]({{impl.url}})
+      {%- else -%}* `{{impl.name}}`
+      {%- endif -%}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("see") -%}**See**:
+  {%- if length(see) == 1 -%}{{first(see)}}
+  {%- else -%}
+    {%- for item in see -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
diff --git a/docs/doxybook/templates/frontmatter.tmpl b/docs/doxybook/templates/frontmatter.tmpl
new file mode 100644
index 000000000..d3b1e5b4f
--- /dev/null
+++ b/docs/doxybook/templates/frontmatter.tmpl
@@ -0,0 +1,43 @@
+---
+{%- if exists("title") -%}
+  title: {{title}}
+{%- else if exists("name") -%}
+  title: {{name}}
+{%- endif -%}
+{%- if exists("summary") -%}
+  summary: {{summary}}
+{%- endif -%}
+{%- if exists("moduleBreadcrumbs") -%}
+  {%- if length(moduleBreadcrumbs) > 0 -%}
+    parent: {{ get(last(moduleBreadcrumbs), "title") }}
+  {%- endif -%}
+  {%- if length(moduleBreadcrumbs) > 1 -%}
+    grand_parent: {{ get(index(moduleBreadcrumbs, -2), "title") }}
+  {%- else if length(moduleBreadcrumbs == 1) and exists("kind") and kind == "group" -%}
+    grand_parent: API
+  {%- endif -%}
+{%- else if exists("kind") and kind == "group" -%}
+  parent: API
+{%- endif -%}
+{%- if exists("kind") and kind == "group" -%}
+  nav_exclude: false
+{%- else -%}
+  nav_exclude: true
+{%- endif -%}
+has_children: true
+has_toc: false
+---
+
+{%- if exists("title") -%}
+  {%- if exists("kind") and kind in ["class", "struct", "namespace"] -%}
+    # {{title(kind)}} `{{title}}`
+  {%- else -%}
+    # {{title}}
+  {%- endif -%}
+{%- else if exists("name") -%}
+  {%- if exists("kind") and kind != "page" -%}
+    # {{name}} {{title(kind)}} Reference
+  {%- else -%}
+    # {{name}}
+  {%- endif -%}
+{%- endif %}
diff --git a/docs/doxybook/templates/index.tmpl b/docs/doxybook/templates/index.tmpl
new file mode 100644
index 000000000..e28f37729
--- /dev/null
+++ b/docs/doxybook/templates/index.tmpl
@@ -0,0 +1,14 @@
+{%- if exists("children") -%}{%- for child in children -%}
+  {%- for i in range(default(index_depth, 0)) -%}
+    {{- noop() }}  {{ noop() -}}
+  {%- endfor -%}
+  * {{ noop() -}}
+  <b><a href="{{ child.url }}">{{ render("name_qualified.tmpl", child) }}</a></b>{{ noop() -}}
+  {%- if existsIn(child, "brief") -%}
+    {{- noop() }} <br> {{ child.brief -}}
+  {%- endif %}
+  {%- if existsIn(child, "children") -%}
+    {%- set child.index_depth = default(index_depth, 0) + 1 -%}
+    {{- render("index.tmpl", child) -}}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
diff --git a/docs/doxybook/templates/index_classes.tmpl b/docs/doxybook/templates/index_classes.tmpl
new file mode 100644
index 000000000..1ccdf71e9
--- /dev/null
+++ b/docs/doxybook/templates/index_classes.tmpl
@@ -0,0 +1,2 @@
+{% include "frontmatter.tmpl" -%}
+{% include "index.tmpl" -%}
diff --git a/docs/doxybook/templates/index_examples.tmpl b/docs/doxybook/templates/index_examples.tmpl
new file mode 100644
index 000000000..1ccdf71e9
--- /dev/null
+++ b/docs/doxybook/templates/index_examples.tmpl
@@ -0,0 +1,2 @@
+{% include "frontmatter.tmpl" -%}
+{% include "index.tmpl" -%}
diff --git a/docs/doxybook/templates/index_files.tmpl b/docs/doxybook/templates/index_files.tmpl
new file mode 100644
index 000000000..1ccdf71e9
--- /dev/null
+++ b/docs/doxybook/templates/index_files.tmpl
@@ -0,0 +1,2 @@
+{% include "frontmatter.tmpl" -%}
+{% include "index.tmpl" -%}
diff --git a/docs/doxybook/templates/index_groups.tmpl b/docs/doxybook/templates/index_groups.tmpl
new file mode 100644
index 000000000..1ccdf71e9
--- /dev/null
+++ b/docs/doxybook/templates/index_groups.tmpl
@@ -0,0 +1,2 @@
+{% include "frontmatter.tmpl" -%}
+{% include "index.tmpl" -%}
diff --git a/docs/doxybook/templates/index_namespaces.tmpl b/docs/doxybook/templates/index_namespaces.tmpl
new file mode 100644
index 000000000..1ccdf71e9
--- /dev/null
+++ b/docs/doxybook/templates/index_namespaces.tmpl
@@ -0,0 +1,2 @@
+{% include "frontmatter.tmpl" -%}
+{% include "index.tmpl" -%}
diff --git a/docs/doxybook/templates/index_pages.tmpl b/docs/doxybook/templates/index_pages.tmpl
new file mode 100644
index 000000000..1ccdf71e9
--- /dev/null
+++ b/docs/doxybook/templates/index_pages.tmpl
@@ -0,0 +1,2 @@
+{% include "frontmatter.tmpl" -%}
+{% include "index.tmpl" -%}
diff --git a/docs/doxybook/templates/kind_class.tmpl b/docs/doxybook/templates/kind_class.tmpl
new file mode 100644
index 000000000..e5650b69b
--- /dev/null
+++ b/docs/doxybook/templates/kind_class.tmpl
@@ -0,0 +1,4 @@
+{% include "frontmatter.tmpl" -%}
+{%- if hasDetails -%}{% include "details.tmpl" -%}{%- endif -%}
+{% include "class_members.tmpl" -%}
+{% include "class_members_details.tmpl" -%}
diff --git a/docs/doxybook/templates/kind_example.tmpl b/docs/doxybook/templates/kind_example.tmpl
new file mode 100644
index 000000000..48501318b
--- /dev/null
+++ b/docs/doxybook/templates/kind_example.tmpl
@@ -0,0 +1,2 @@
+{% include "frontmatter.tmpl" -%}
+{%- if exists("details") -%}{{details}}{%- endif -%}
diff --git a/docs/doxybook/templates/kind_file.tmpl b/docs/doxybook/templates/kind_file.tmpl
new file mode 100644
index 000000000..c883442f1
--- /dev/null
+++ b/docs/doxybook/templates/kind_file.tmpl
@@ -0,0 +1,10 @@
+{% include "frontmatter.tmpl" -%}
+{%- if hasDetails -%}{% include "details.tmpl" -%}{%- endif -%}
+{% include "nonclass_members_details.tmpl" -%}
+{% include "nonclass_members.tmpl" -%}
+{%- if exists("programlisting") -%}
+
+```cpp
+{{programlisting}}
+```
+{%- endif -%}
diff --git a/docs/doxybook/templates/kind_group.tmpl b/docs/doxybook/templates/kind_group.tmpl
new file mode 100644
index 000000000..1ff7342a4
--- /dev/null
+++ b/docs/doxybook/templates/kind_group.tmpl
@@ -0,0 +1,4 @@
+{% include "frontmatter.tmpl" -%}
+{%- if hasDetails %}{% include "details.tmpl" -%}{%- endif -%}
+{% include "nonclass_members.tmpl" -%}
+{% include "nonclass_members_details.tmpl" -%}
diff --git a/docs/doxybook/templates/kind_nonclass.tmpl b/docs/doxybook/templates/kind_nonclass.tmpl
new file mode 100644
index 000000000..299208c41
--- /dev/null
+++ b/docs/doxybook/templates/kind_nonclass.tmpl
@@ -0,0 +1,8 @@
+{% include "frontmatter.tmpl" -%}
+{%- if hasDetails %}{% include "details.tmpl" -%}{%- endif -%}
+{% if kind == "namespace" -%}
+  {%- include "namespace_members.tmpl" -%}
+{%- else -%}
+  {%- include "nonclass_members.tmpl" -%}
+{%- endif -%}
+{% include "nonclass_members_details.tmpl" -%}
diff --git a/docs/doxybook/templates/kind_page.tmpl b/docs/doxybook/templates/kind_page.tmpl
new file mode 100644
index 000000000..48501318b
--- /dev/null
+++ b/docs/doxybook/templates/kind_page.tmpl
@@ -0,0 +1,2 @@
+{% include "frontmatter.tmpl" -%}
+{%- if exists("details") -%}{{details}}{%- endif -%}
diff --git a/docs/doxybook/templates/member_details.tmpl b/docs/doxybook/templates/member_details.tmpl
new file mode 100644
index 000000000..14b34dcfc
--- /dev/null
+++ b/docs/doxybook/templates/member_details.tmpl
@@ -0,0 +1,39 @@
+{%- if exists("type") and type in ["class", "struct"] -%}
+  <code class="doxybook">
+  {%- include "synopsis_class.tmpl" -%}
+  </code>
+{%- else if kind == "enum" -%}
+  {%- include "table_header_enum.tmpl" -%}
+  {%- for enumerator in enumvalues -%}{{- render("table_row_enum.tmpl", enumerator) -}}
+  {%- endfor %}
+{%- else if kind in ["typedef", "using"] -%}
+  <code class="doxybook">
+  {%- include "synopsis_template_parameters.tmpl" -%}
+  <span>{%- include "synopsis_kind.tmpl" -%}<b>{{name}}</b>{%- include "synopsis_initializer.tmpl" -%};</span>{{- noop() -}}
+  </code>
+{%- else if kind in ["variable", "property"] -%}
+  <code class="doxybook">
+  {%- include "synopsis_template_parameters.tmpl" -%}
+  <span>{%- include "synopsis_type_and_leading_specifiers.tmpl" -%}<b>{{name}}</b>{%- include "synopsis_initializer.tmpl" -%};</span>{{- noop() -}}
+  </code>
+{%- else if kind in ["function", "slot", "signal", "event"] -%}
+  <code class="doxybook">
+  {%- include "synopsis_template_parameters.tmpl" -%}
+  {% include "synopsis_function_type_and_leading_specifiers.tmpl" -%}
+  <span><b>{{name}}</b>({%- include "synopsis_function_parameters.tmpl" -%}){%- include "synopsis_function_trailing_specifiers.tmpl" -%};</span>{{- noop() -}}
+  </code>
+{%- else if kind == "friend" -%}
+  {%- if type != "class" and type != "struct" -%}
+    <code class="doxybook">
+    {% include "synopsis_template_parameters.tmpl" -%}
+    {% include "synopsis_function_type_and_leading_specifiers.tmpl" -%}
+    <span><b>{{name}}</b>({%- include "synopsis_function_parameters.tmpl" -%}){%- include "synopsis_function_trailing_specifiers.tmpl" -%};</span>{{- noop() -}}
+    </code>
+  {%- endif -%}
+{%- else if kind == "define" -%}
+  {#- We have no way to get the parameters to function-like     -#}{{ noop() -}}
+  {#- macros, and the macro definitions in `initializer` fields -#}{{ noop() -}}
+  {#- don't have line breaks. So we can't render a useful       -#}{{ noop() -}}
+  {#- synopsis.                                                 -#}{{ noop() -}}
+{% endif -%}
+{% include "details.tmpl" -%}
diff --git a/docs/doxybook/templates/name.tmpl b/docs/doxybook/templates/name.tmpl
new file mode 100644
index 000000000..09f15420e
--- /dev/null
+++ b/docs/doxybook/templates/name.tmpl
@@ -0,0 +1,5 @@
+{%- if default(names_qualified, true) -%}
+  {{- render("name_qualified.tmpl", child) -}}
+{%- else -%}
+  {{- render("name_unqualified.tmpl", child) -}}
+{%- endif -%}
diff --git a/docs/doxybook/templates/name_qualified.tmpl b/docs/doxybook/templates/name_qualified.tmpl
new file mode 100644
index 000000000..da088dd34
--- /dev/null
+++ b/docs/doxybook/templates/name_qualified.tmpl
@@ -0,0 +1,7 @@
+{%- if exists("qualifiedname") -%}
+  {{- escape(qualifiedname) -}}
+{%- else if exists("name") -%}
+  {{- escape(name) -}}
+{%- else -%}
+  {{- escape(title) -}}
+{%- endif -%}
diff --git a/docs/doxybook/templates/name_unqualified.tmpl b/docs/doxybook/templates/name_unqualified.tmpl
new file mode 100644
index 000000000..2a0d73725
--- /dev/null
+++ b/docs/doxybook/templates/name_unqualified.tmpl
@@ -0,0 +1,5 @@
+{%- if exists("name") -%}
+  {{- escape(stripNamespace(name)) -}}
+{%- else -%}
+  {{- escape(stripNamespace(title)) -}}
+{%- endif -%}
diff --git a/docs/doxybook/templates/namespace_members.tmpl b/docs/doxybook/templates/namespace_members.tmpl
new file mode 100644
index 000000000..8bb4bdffc
--- /dev/null
+++ b/docs/doxybook/templates/namespace_members.tmpl
@@ -0,0 +1,43 @@
+<code class="doxybook">
+{%- if exists("includes") -%}
+  <span>#include {{includes}}</span>{{ noop() -}}
+  <br>
+{%- endif -%}
+<span>{%- include "synopsis_kind.tmpl" -%}{% include "name_qualified.tmpl" %} {</span>
+{%- set synopsis_needs_leading_line_break = true -%}
+{%- set names_qualified = false -%}
+{%- if exists("namespaces") -%}
+  {%- for child in namespaces -%}
+    {%- include "synopsis_namespace_abbreviated.tmpl" -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("publicClasses") -%}
+  {%- for child in publicClasses -%}
+    {%- include "synopsis_class.tmpl" -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("publicTypes") -%}
+  {%- for child in publicTypes -%}
+    {%- include "synopsis_type.tmpl" -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("publicAttributes") -%}
+  {%- for child in publicAttributes -%}
+    {%- include "synopsis_variable.tmpl" -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("publicFunctions") -%}
+  {%- for child in publicFunctions -%}
+    {%- if existsIn(child, "type") -%}
+      {#- If the child doesn't have a type, it's probably a    -#}{{- noop() -}}
+      {#- constructor that Doxygen put into a non-class entity -#}{{- noop() -}}
+      {#- due to a bug whose nature is beyond me.              -#}{{- noop() -}}
+      {%- include "synopsis_function.tmpl" -%}
+    {%- endif -%}
+  {%- endfor -%}
+{%- endif -%}
+<span>} {{ noop() -}}
+  /* {%- include "synopsis_kind.tmpl" -%}{% include "name_qualified.tmpl" %} */{{ noop() -}}
+</span>
+</code>
+
diff --git a/docs/doxybook/templates/nonclass_members.tmpl b/docs/doxybook/templates/nonclass_members.tmpl
new file mode 100644
index 000000000..af3d39c17
--- /dev/null
+++ b/docs/doxybook/templates/nonclass_members.tmpl
@@ -0,0 +1,60 @@
+{%- if exists("groups") %}## Groups
+
+  {%- for child in sort(groups) -%}* **[{{ child.title }}]({{ child.url }})**{% if existsIn(child, "brief") %}: {{ child.brief }}{% endif %}
+  {%- endfor %}
+{% endif -%}
+{%- if exists("dirs") %}## Directories
+
+  {%- for child in dirs -%}* **[`{{ child.name }}`]({{ child.url }})**{% if existsIn(child, "brief") %}: {{ child.brief }}{% endif %}
+  {%- endfor %}
+{% endif -%}
+{%- if exists("files") %}## Files
+
+  {%- include "table_header_brief.tmpl" -%}
+  {%- for child in files -%}{{- render("table_row_brief.tmpl", child) -}}
+  {%- endfor %}
+{% endif -%}
+<code class="doxybook">
+{%- if exists("namespaces") -%}
+  {%- for child in namespaces -%}
+    {%- include "synopsis_namespace_abbreviated.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("publicClasses") -%}
+  {%- for child in publicClasses -%}
+    {%- include "synopsis_class.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("publicTypes") -%}
+  {%- for child in publicTypes -%}
+    {%- include "synopsis_type.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("publicAttributes") -%}
+  {%- for child in publicAttributes -%}
+    {%- include "synopsis_variable.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("publicFunctions") -%}
+  {%- for child in publicFunctions -%}
+    {%- if existsIn(child, "type") -%}
+      {#- If the child doesn't have a type, it's probably a    -#}{{- noop() -}}
+      {#- constructor that Doxygen put into a non-class entity -#}{{- noop() -}}
+      {#- due to a bug whose nature is beyond me.              -#}{{- noop() -}}
+      {%- include "synopsis_function.tmpl" -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endif -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("defines") -%}
+  {%- for child in defines -%}
+    {%- include "synopsis_macro.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+</code>
+
diff --git a/docs/doxybook/templates/nonclass_members_details.tmpl b/docs/doxybook/templates/nonclass_members_details.tmpl
new file mode 100644
index 000000000..c941f22f7
--- /dev/null
+++ b/docs/doxybook/templates/nonclass_members_details.tmpl
@@ -0,0 +1,35 @@
+{%- if exists("publicClasses") -%}## Member Classes
+
+  {%- for child in publicClasses -%}
+    {% include "title_nonmember.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
+{%- if exists("publicTypes") -%}## Types
+
+  {%- for child in publicTypes -%}
+    {% include "title_nonmember.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
+{%- if exists("publicAttributes") %}## Variables
+
+  {%- for child in publicAttributes -%}
+    {% include "title_nonmember.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
+{%- if exists("publicFunctions") %}## Functions
+
+  {%- for child in publicFunctions -%}
+    {% include "title_nonmember.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
+{%- if exists("defines") %}## Macros
+
+  {%- for child in defines -%}
+    {% include "title_nonmember.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
diff --git a/docs/doxybook/templates/synopsis_brief.tmpl b/docs/doxybook/templates/synopsis_brief.tmpl
new file mode 100644
index 000000000..2f48cec1d
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_brief.tmpl
@@ -0,0 +1,8 @@
+{%- if exists("brief") -%}
+  <span class="doxybook-comment">{{ noop() -}}
+    {%- if default(synopsis_indent_width, 0) != 0 -%}
+      <code>{%- include "synopsis_indent.tmpl" -%}</code>
+    {%- endif -%}
+    /* {{ brief }} */{{ noop() -}}
+  </span>{{ noop() -}}
+{%- endif -%}
diff --git a/docs/doxybook/templates/synopsis_class.tmpl b/docs/doxybook/templates/synopsis_class.tmpl
new file mode 100644
index 000000000..a5492997c
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_class.tmpl
@@ -0,0 +1,16 @@
+{%- include "synopsis_leading_line_break.tmpl" -%}
+{%- include "synopsis_inherited_from.tmpl" -%}
+{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+{{- render("synopsis_brief.tmpl", child) -}}
+{#- The Doxygen metadata that a parent has on its nested   -#}{{ noop() -}}
+{#- classes doesn't include their template parameters.     -#}{{ noop() -}}
+{#- Fortunately, we have the refid of the nested class, so -#}{{ noop() -}}
+{#- so we can just load the data from their page.          -#}{{ noop() -}}
+{%- set child_class = load(child.refid)) -%}
+{%- set child_class.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+{{- render("synopsis_template_parameters.tmpl", child_class) -}}
+<span>{{ noop() -}}
+  {%- include "synopsis_indent.tmpl" -%}
+  {{- render("synopsis_kind_abbreviated.tmpl", child) -}}
+  <b><a href="{{ child.url }}">{% include "name.tmpl" %}</a></b>;{{ noop() -}}
+</span>
diff --git a/docs/doxybook/templates/synopsis_friend_class.tmpl b/docs/doxybook/templates/synopsis_friend_class.tmpl
new file mode 100644
index 000000000..39f23bb09
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_friend_class.tmpl
@@ -0,0 +1,14 @@
+{%- include "synopsis_leading_line_break.tmpl" -%}
+{%- include "synopsis_inherited_from.tmpl" -%}
+{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+{{- render("synopsis_brief.tmpl", child) -}}
+{{- render("synopsis_template_parameters.tmpl", child) -}}
+{#- Unfortunately, the refid and URL for a friend class  -#}{{ noop() -}}
+{#- incorrectly refers to a definition on the local      -#}{{ noop() -}}
+{#- page, instead of the friend class's own page.        -#}{{ noop() -}}
+{#- So we don't link to friend classes.                  -#}{{ noop() -}}
+<span>{{ noop() -}}
+  {%- include "synopsis_indent.tmpl" -%}
+  {{- render("synopsis_kind_abbreviated.tmpl", child) -}}
+  <b>{{- render("name_qualified.tmpl", child) -}}</b>;{{ noop() -}}
+</span>
diff --git a/docs/doxybook/templates/synopsis_friend_function.tmpl b/docs/doxybook/templates/synopsis_friend_function.tmpl
new file mode 100644
index 000000000..440989c23
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_friend_function.tmpl
@@ -0,0 +1,19 @@
+{%- include "synopsis_leading_line_break.tmpl" -%}
+{%- include "synopsis_inherited_from.tmpl" -%}
+{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+{{- render("synopsis_brief.tmpl", child) -}}
+{{- render("synopsis_template_parameters.tmpl", child) -}}
+{#- Unfortunately, the refid and URL for a friend class  -#}{{ noop() -}}
+{#- incorrectly refers to a definition on the local      -#}{{ noop() -}}
+{#- page, instead of the friend class's own page.        -#}{{ noop() -}}
+{#- So we don't link to friend classes.                  -#}{{ noop() -}}
+<span>{{ noop() -}}
+  {%- include "synopsis_indent.tmpl" -%}
+  friend {{- render("synopsis_type_and_leading_specifiers.tmpl", child) -}}
+</span>
+<span>{{ noop() -}}
+  {%- include "synopsis_indent.tmpl" -%}
+  <b>{{- render("name_qualified.tmpl", child) -}}</b>{{ noop() -}}
+  ({{- render("synopsis_function_parameters.tmpl", child) -}}){{ noop() -}}
+  {{- render("synopsis_function_trailing_specifiers.tmpl", child) -}};{{ noop() -}}
+</span>
diff --git a/docs/doxybook/templates/synopsis_function.tmpl b/docs/doxybook/templates/synopsis_function.tmpl
new file mode 100644
index 000000000..93a3e822e
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_function.tmpl
@@ -0,0 +1,12 @@
+{%- include "synopsis_leading_line_break.tmpl" -%}
+{%- include "synopsis_inherited_from.tmpl" -%}
+{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+{{- render("synopsis_brief.tmpl", child) -}}
+{{- render("synopsis_template_parameters.tmpl", child) -}}
+{{- render("synopsis_function_type_and_leading_specifiers.tmpl", child) -}}
+<span>{{ noop() -}}
+  {%- include "synopsis_indent.tmpl" -%}
+  <b><a href="{{ child.url }}">{% include "name.tmpl" %}</a></b>{{ noop() -}}
+  ({{- render("synopsis_function_parameters.tmpl", child) -}}){{ noop() -}}
+  {{- render("synopsis_function_trailing_specifiers.tmpl", child) -}};{{- noop() -}}
+</span>
diff --git a/docs/doxybook/templates/synopsis_function_parameters.tmpl b/docs/doxybook/templates/synopsis_function_parameters.tmpl
new file mode 100644
index 000000000..204a52c50
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_function_parameters.tmpl
@@ -0,0 +1,11 @@
+{%- for param in params -%}
+  {%- if not loop.is_first -%}&nbsp;&nbsp;{%- endif -%}
+  {{- param.type -}}
+  {%- if not isEmpty(param.name) %} {% endif -%}
+  {{- param.name -}}
+  {%- if existsIn(param, "defvalPlain") %} = {{ escape(param.defvalPlain) }}{% endif -%}
+  {%- if not loop.is_last -%}
+    ,</span>
+    {{- noop() }}<span>{% include "synopsis_indent.tmpl" -%}
+  {%- endif -%}
+{%- endfor -%}
diff --git a/docs/doxybook/templates/synopsis_function_trailing_specifiers.tmpl b/docs/doxybook/templates/synopsis_function_trailing_specifiers.tmpl
new file mode 100644
index 000000000..bbde0f1dd
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_function_trailing_specifiers.tmpl
@@ -0,0 +1,5 @@
+{%- if const %} const{% endif -%}
+{%- if override %} override{% endif -%}
+{%- if default %} = default{% endif -%}
+{%- if deleted %} = deleted{% endif -%}
+{%- if pureVirtual %} = 0{% endif -%}
diff --git a/docs/doxybook/templates/synopsis_function_type_and_leading_specifiers.tmpl b/docs/doxybook/templates/synopsis_function_type_and_leading_specifiers.tmpl
new file mode 100644
index 000000000..5cde64d28
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_function_type_and_leading_specifiers.tmpl
@@ -0,0 +1,6 @@
+{%- if default(virtual, false) or default(static, false) or default(explicit, false) or default(type, false) -%}
+  <span>{{ noop() -}}
+    {%- include "synopsis_indent.tmpl" -%}
+    {%- include "synopsis_type_and_leading_specifiers.tmpl" -%}
+  </span>{{ noop() -}}
+{%- endif -%}
diff --git a/docs/doxybook/templates/synopsis_indent.tmpl b/docs/doxybook/templates/synopsis_indent.tmpl
new file mode 100644
index 000000000..a2d7193a6
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_indent.tmpl
@@ -0,0 +1,5 @@
+{%- if default(synopsis_indent_width, false) -%}
+  {%- for i in range(synopsis_indent_width) -%}
+    &nbsp;{{ noop() -}}
+  {%- endfor -%}
+{%- endif -%}
diff --git a/docs/doxybook/templates/synopsis_inherited_from.tmpl b/docs/doxybook/templates/synopsis_inherited_from.tmpl
new file mode 100644
index 000000000..fd88b649c
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_inherited_from.tmpl
@@ -0,0 +1,4 @@
+{%- if default(synopsis_is_inherited, false) != false -%}
+  {%- set base.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+  {{- render("synopsis_inherited_from_comment.tmpl", base) -}}
+{%- endif -%}
diff --git a/docs/doxybook/templates/synopsis_inherited_from_comment.tmpl b/docs/doxybook/templates/synopsis_inherited_from_comment.tmpl
new file mode 100644
index 000000000..4afda1250
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_inherited_from_comment.tmpl
@@ -0,0 +1,8 @@
+<span class="doxybook-comment">{{ noop() -}}
+  {%- if default(synopsis_indent_width, 0) != 0 -%}
+    <code>{%- include "synopsis_indent.tmpl" -%}</code>
+  {%- endif -%}
+  /* Inherited from <code>{{ noop() -}}
+    <b><a href="{{ url }}">{%- include "name_qualified.tmpl" -%}</a></b>{{ noop() -}}
+  </code> */{{ noop() -}}
+</span>{{ noop() -}}
diff --git a/docs/doxybook/templates/synopsis_initializer.tmpl b/docs/doxybook/templates/synopsis_initializer.tmpl
new file mode 100644
index 000000000..dd159979d
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_initializer.tmpl
@@ -0,0 +1,3 @@
+{%- if kind == "using" %} = {{ escape(type) -}}
+{%- else if exists("initializer") %} {{ escape(initializer) -}}
+{%- endif -%}
diff --git a/docs/doxybook/templates/synopsis_initializer_abbreviated.tmpl b/docs/doxybook/templates/synopsis_initializer_abbreviated.tmpl
new file mode 100644
index 000000000..2bc4d4856
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_initializer_abbreviated.tmpl
@@ -0,0 +1 @@
+{% if kind == "using" or exists("initializer") %} = <i>see below</i>{% endif -%}
diff --git a/docs/doxybook/templates/synopsis_kind.tmpl b/docs/doxybook/templates/synopsis_kind.tmpl
new file mode 100644
index 000000000..34cd602a9
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_kind.tmpl
@@ -0,0 +1,9 @@
+{%- if kind == "interface" %}class {{ noop() -}}
+{%- else if kind == "namespace" %}namespace {{ noop() -}}
+{%- else if kind == "typedef" %}typedef {{ type -}}
+{%- else if kind == "enum" %}enum {% if strong %}class {% endif -%} {{ noop() -}}
+{%- else if kind == "friend" %}friend {{ noop() -}}
+  {%- if type == "class" or type == "struct" %}{{ type }} {% endif -%}
+{%- else if kind == "define" %}#define {{ noop() -}}
+{%- else %}{{ kind }} {{ noop() -}}
+{%- endif -%}
diff --git a/docs/doxybook/templates/synopsis_kind_abbreviated.tmpl b/docs/doxybook/templates/synopsis_kind_abbreviated.tmpl
new file mode 100644
index 000000000..881582773
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_kind_abbreviated.tmpl
@@ -0,0 +1,9 @@
+{%- if kind == "interface" %}class {{ noop() -}}
+{%- else if kind == "namespace" %}namespace {{ noop() -}}
+{%- else if kind == "typedef" %}typedef <i>see below</i> {{ noop() -}}
+{%- else if kind == "enum" %}enum {% if strong %}class {% endif -%}
+{%- else if kind == "friend" %}friend {{ noop() -}}
+  {%- if type == "class" or type == "struct" %}{{type}} {% endif -%}
+{%- else if kind == "define" %}#define {{ noop() -}}
+{%- else %}{{ kind }} {{ noop() -}}
+{%- endif -%}
diff --git a/docs/doxybook/templates/synopsis_leading_line_break.tmpl b/docs/doxybook/templates/synopsis_leading_line_break.tmpl
new file mode 100644
index 000000000..13a1574e3
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_leading_line_break.tmpl
@@ -0,0 +1,3 @@
+{%- if default(synopsis_needs_leading_line_break, false) -%}
+  <br>
+{%- endif -%}
diff --git a/docs/doxybook/templates/synopsis_macro.tmpl b/docs/doxybook/templates/synopsis_macro.tmpl
new file mode 100644
index 000000000..612773439
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_macro.tmpl
@@ -0,0 +1,7 @@
+{%- include "synopsis_leading_line_break.tmpl" -%}
+{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+<span>{{ noop() -}}
+  {{- render("synopsis_kind.tmpl", child) -}}
+  <b><a href="{{ child.url }}">{{- render("name_qualified.tmpl", child) -}}</a></b>{{ noop() -}}
+  {{- render("synopsis_initializer_abbreviated.tmpl", child) -}};{{ noop() -}}
+</span>
diff --git a/docs/doxybook/templates/synopsis_member_namespace_abbreviated.tmpl b/docs/doxybook/templates/synopsis_member_namespace_abbreviated.tmpl
new file mode 100644
index 000000000..682f615c9
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_member_namespace_abbreviated.tmpl
@@ -0,0 +1,7 @@
+{%- include "synopsis_leading_line_break.tmpl" -%}
+{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+{{- render("synopsis_brief.tmpl", child) -}}
+<span>{{ noop() -}}
+  {{- render("synopsis_kind_abbreviated.tmpl", child) -}}
+  <b><a href="{{ child.url }}">{{- render("name_qualified.tmpl", child) -}}</a></b> { <i>…</i> }{{ noop() -}}
+</span>
diff --git a/docs/doxybook/templates/synopsis_namespace_abbreviated.tmpl b/docs/doxybook/templates/synopsis_namespace_abbreviated.tmpl
new file mode 100644
index 000000000..682f615c9
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_namespace_abbreviated.tmpl
@@ -0,0 +1,7 @@
+{%- include "synopsis_leading_line_break.tmpl" -%}
+{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+{{- render("synopsis_brief.tmpl", child) -}}
+<span>{{ noop() -}}
+  {{- render("synopsis_kind_abbreviated.tmpl", child) -}}
+  <b><a href="{{ child.url }}">{{- render("name_qualified.tmpl", child) -}}</a></b> { <i>…</i> }{{ noop() -}}
+</span>
diff --git a/docs/doxybook/templates/synopsis_template_parameters.tmpl b/docs/doxybook/templates/synopsis_template_parameters.tmpl
new file mode 100644
index 000000000..4391c3d99
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_template_parameters.tmpl
@@ -0,0 +1,14 @@
+{%- if exists("templateParams") -%}
+  <span>{% include "synopsis_indent.tmpl" -%}template &lt;{{ noop() -}}
+  {%- for param in templateParams -%}
+    {%- if not loop.is_first %}{% include "synopsis_indent.tmpl" -%}&nbsp;&nbsp;{% endif -%}
+    {{- param.type -}}
+    {%- if not isEmpty(param.name) %} {% endif -%}
+    {{- param.name -}}
+    {%- if existsIn(param, "defvalPlain") %} = {{ escape(param.defvalPlain) }}{% endif -%}
+    {%- if not loop.is_last -%}
+      ,</span>
+      {{- noop() }}<span>{% include "synopsis_indent.tmpl" -%}
+    {%- endif -%}
+  {%- endfor -%}&gt;</span>
+{%- endif -%}
diff --git a/docs/doxybook/templates/synopsis_type.tmpl b/docs/doxybook/templates/synopsis_type.tmpl
new file mode 100644
index 000000000..586555f08
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_type.tmpl
@@ -0,0 +1,11 @@
+{%- include "synopsis_leading_line_break.tmpl" -%}
+{%- include "synopsis_inherited_from.tmpl" -%}
+{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+{{- render("synopsis_brief.tmpl", child) -}}
+{{- render("synopsis_template_parameters.tmpl", child) -}}
+<span>{{ noop() -}}
+  {%- include "synopsis_indent.tmpl" -%}
+  {{- render("synopsis_kind_abbreviated.tmpl", child) -}}
+  <b><a href="{{ child.url }}">{% include "name.tmpl" %}</a></b>{{ noop() -}}
+  {{- render("synopsis_initializer_abbreviated.tmpl", child) -}};{{ noop() -}}
+</span>
diff --git a/docs/doxybook/templates/synopsis_type_and_leading_specifiers.tmpl b/docs/doxybook/templates/synopsis_type_and_leading_specifiers.tmpl
new file mode 100644
index 000000000..12136020f
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_type_and_leading_specifiers.tmpl
@@ -0,0 +1,4 @@
+{%- if default(virtual, false) %}virtual {% endif -%}
+{%- if default(static, false) %}static {% endif -%}
+{%- if default(explicit, false) %}explicit {% endif -%}
+{%- if exists("type") %}{{ type }} {% endif -%}
diff --git a/docs/doxybook/templates/synopsis_variable.tmpl b/docs/doxybook/templates/synopsis_variable.tmpl
new file mode 100644
index 000000000..52c48da50
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_variable.tmpl
@@ -0,0 +1,11 @@
+{%- include "synopsis_leading_line_break.tmpl" -%}
+{%- include "synopsis_inherited_from.tmpl" -%}
+{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+{{- render("synopsis_brief.tmpl", child) -}}
+{{- render("synopsis_template_parameters.tmpl", child) -}}
+<span>{{ noop() -}}
+  {%- include "synopsis_indent.tmpl" -%}
+  {{- render("synopsis_type_and_leading_specifiers.tmpl", child) -}}
+  <b><a href="{{ child.url }}">{% include "name.tmpl" %}</a></b>{{ noop() -}}
+  {{- render("synopsis_initializer_abbreviated.tmpl", child) -}};{{ noop() -}}
+</span>
diff --git a/docs/doxybook/templates/table_header_brief.tmpl b/docs/doxybook/templates/table_header_brief.tmpl
new file mode 100644
index 000000000..ed13f970f
--- /dev/null
+++ b/docs/doxybook/templates/table_header_brief.tmpl
@@ -0,0 +1,2 @@
+| Name | Description |
+|------|-------------|
diff --git a/docs/doxybook/templates/table_header_enum.tmpl b/docs/doxybook/templates/table_header_enum.tmpl
new file mode 100644
index 000000000..cdf95bc6f
--- /dev/null
+++ b/docs/doxybook/templates/table_header_enum.tmpl
@@ -0,0 +1,2 @@
+| Enumerator | Value | Description |
+|------------|-------|-------------|
diff --git a/docs/doxybook/templates/table_row_brief.tmpl b/docs/doxybook/templates/table_row_brief.tmpl
new file mode 100644
index 000000000..1d599755f
--- /dev/null
+++ b/docs/doxybook/templates/table_row_brief.tmpl
@@ -0,0 +1 @@
+| **[`{{name}}`]({{url}})** | {% if exists("brief") %}{{brief}}{% endif %} |
diff --git a/docs/doxybook/templates/table_row_enum.tmpl b/docs/doxybook/templates/table_row_enum.tmpl
new file mode 100644
index 000000000..77c205be3
--- /dev/null
+++ b/docs/doxybook/templates/table_row_enum.tmpl
@@ -0,0 +1 @@
+| `{{ name }}` | {% if exists("initializer") -%}`{{ escape(replace(initializer, "= ", "")) }}`{%- endif %} | {% if exists("brief") -%}{{ brief }}{%- endif %} |
diff --git a/docs/doxybook/templates/title_kind.tmpl b/docs/doxybook/templates/title_kind.tmpl
new file mode 100644
index 000000000..100db2e84
--- /dev/null
+++ b/docs/doxybook/templates/title_kind.tmpl
@@ -0,0 +1,4 @@
+{%- if child.kind == "using" %}Type Alias{{ noop() -}}
+{%- else -%}{{ title(child.kind) -}}
+{%- endif -%}
+{%- if child.kind == "enum" and child.strong %} Class{%- endif -%}
diff --git a/docs/doxybook/templates/title_leading.tmpl b/docs/doxybook/templates/title_leading.tmpl
new file mode 100644
index 000000000..54eb7e967
--- /dev/null
+++ b/docs/doxybook/templates/title_leading.tmpl
@@ -0,0 +1,4 @@
+<h3 id="{{ child.kind }}-{{ safeAnchorId(child.name) }}">
+{%- if existsIn(child, "kind") and child.kind in ["class", "struct"] -%}
+  <a href="{{ child.url }}">{{ noop() -}}
+{%- endif -%}
diff --git a/docs/doxybook/templates/title_member.tmpl b/docs/doxybook/templates/title_member.tmpl
new file mode 100644
index 000000000..50e70f378
--- /dev/null
+++ b/docs/doxybook/templates/title_member.tmpl
@@ -0,0 +1,4 @@
+{%- include "title_leading.tmpl" -%}
+  {%- include "title_kind.tmpl" -%}
+  {{- noop() }} <code>{% include "name_qualified.tmpl" %}::{{ render("name_unqualified.tmpl", child) }}</code>
+{%- include "title_trailing.tmpl" -%}
diff --git a/docs/doxybook/templates/title_nonmember.tmpl b/docs/doxybook/templates/title_nonmember.tmpl
new file mode 100644
index 000000000..4ea9797fd
--- /dev/null
+++ b/docs/doxybook/templates/title_nonmember.tmpl
@@ -0,0 +1,5 @@
+{%- include "title_leading.tmpl" -%}
+  {%- include "title_kind.tmpl" -%}
+  {{- noop() }} <code>{{render("name_qualified.tmpl", child)}}</code>
+{%- include "title_trailing.tmpl" -%}
+
diff --git a/docs/doxybook/templates/title_trailing.tmpl b/docs/doxybook/templates/title_trailing.tmpl
new file mode 100644
index 000000000..fcc4f24e6
--- /dev/null
+++ b/docs/doxybook/templates/title_trailing.tmpl
@@ -0,0 +1,4 @@
+{%- if existsIn(child, "kind") and child.kind in ["class", "struct"] -%}
+  </a>
+{%- endif -%}
+</h3>
diff --git a/doc/thrust.dox b/docs/doxygen/config.dox
similarity index 82%
rename from doc/thrust.dox
rename to docs/doxygen/config.dox
index b74f436f5..7e06e3545 100644
--- a/doc/thrust.dox
+++ b/docs/doxygen/config.dox
@@ -1,4 +1,4 @@
-# Doxyfile 1.8.13
+# Doxyfile 1.9.3
 
 # This file describes the settings to be used by the documentation system
 # doxygen (www.doxygen.org) for a project.
@@ -17,11 +17,11 @@
 # Project related configuration options
 #---------------------------------------------------------------------------
 
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all text
-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
-# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
-# for the list of possible encodings.
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
 # The default value is: UTF-8.
 
 DOXYFILE_ENCODING      = UTF-8
@@ -32,7 +32,7 @@ DOXYFILE_ENCODING      = UTF-8
 # title of most generated pages and in a few other places.
 # The default value is: My Project.
 
-PROJECT_NAME           = thrust
+PROJECT_NAME           = Thrust
 
 # The PROJECT_NUMBER tag can be used to enter a project or revision number. This
 # could be handy for archiving the generated documentation or if some version
@@ -58,7 +58,7 @@ PROJECT_LOGO           =
 # entered, it will be relative to the location where doxygen was started. If
 # left blank the current directory will be used.
 
-OUTPUT_DIRECTORY       = doc
+OUTPUT_DIRECTORY       =
 
 # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
 # directories (in 2 levels) under the output directory of each output format and
@@ -187,7 +187,17 @@ SHORT_NAMES            = NO
 # description.)
 # The default value is: NO.
 
-JAVADOC_AUTOBRIEF      = NO
+JAVADOC_AUTOBRIEF      = YES
+
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER         = NO
 
 # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
 # line (until the first dot) of a Qt-style comment as the brief description. If
@@ -209,6 +219,14 @@ QT_AUTOBRIEF           = NO
 
 MULTILINE_CPP_IS_BRIEF = NO
 
+# By default Python docstrings are displayed as preformatted text and doxygen's
+# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the
+# doxygen's special commands can be used and the contents of the docstring
+# documentation blocks is shown as doxygen documentation.
+# The default value is: YES.
+
+PYTHON_DOCSTRING       = YES
+
 # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
 # documentation from any documented member that it re-implements.
 # The default value is: YES.
@@ -220,7 +238,7 @@ INHERIT_DOCS           = YES
 # of the file/class/namespace that contains it.
 # The default value is: NO.
 
-SEPARATE_MEMBER_PAGES  = YES
+SEPARATE_MEMBER_PAGES  = NO
 
 # The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
 # uses this value to replace tabs by spaces in code fragments.
@@ -232,20 +250,19 @@ TAB_SIZE               = 8
 # the documentation. An alias has the form:
 # name=value
 # For example adding
-# "sideeffect=@par Side Effects:\n"
+# "sideeffect=@par Side Effects:^^"
 # will allow you to put the command \sideeffect (or @sideeffect) in the
 # documentation, which will result in a user-defined paragraph with heading
-# "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines.
+# "Side Effects:". Note that you cannot put \n's in the value part of an alias
+# to insert newlines (in the resulting output). You can put ^^ in the value part
+# of an alias to insert a newline as if a physical newline was in the original
+# file. When you need a literal { or } or , in the value part of an alias you
+# have to escape them by means of a backslash (\), this can lead to conflicts
+# with the commands \{ and \} for these it is advised to use the version @{ and
+# @} or use a double escape (\\{ and \\})
 
 ALIASES                =
 
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST              =
-
 # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
 # only. Doxygen will then generate output that is more tailored for C. For
 # instance, some of the names that are used will be different. The list of all
@@ -274,28 +291,40 @@ OPTIMIZE_FOR_FORTRAN   = NO
 
 OPTIMIZE_OUTPUT_VHDL   = NO
 
+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
+# sources only. Doxygen will then generate output that is more tailored for that
+# language. For instance, namespaces will be presented as modules, types will be
+# separated into more groups, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_SLICE  = NO
+
 # Doxygen selects the parser to use depending on the extension of the files it
 # parses. With this tag you can assign which parser to use for a given
 # extension. Doxygen has a built-in mapping, but you can override or extend it
 # using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
-# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
-# Fortran. In the later case the parser tries to guess whether the code is fixed
-# or free formatted code, this is the default for Fortran type files), VHDL. For
-# instance to make doxygen treat .inc files as Fortran files (default is PHP),
-# and .f files as C (default is Fortran), use: inc=Fortran f=C.
+# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
+# Csharp (C#), C, C++, Lex, D, PHP, md (Markdown), Objective-C, Python, Slice,
+# VHDL, Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser
+# tries to guess whether the code is fixed or free formatted code, this is the
+# default for Fortran type files). For instance to make doxygen treat .inc files
+# as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C.
 #
 # Note: For files without extension you can use no_extension as a placeholder.
 #
 # Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
+# the files are not read by doxygen. When specifying no_extension you should add
+# * to the FILE_PATTERNS.
+#
+# Note see also the list of default file extension mappings.
 
 EXTENSION_MAPPING      =
 
 # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
 # according to the Markdown format, which allows for more readable
-# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# documentation. See https://daringfireball.net/projects/markdown/ for details.
 # The output of markdown processing is further processed by doxygen, so you can
 # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
 # case of backward compatibilities issues.
@@ -307,7 +336,7 @@ MARKDOWN_SUPPORT       = YES
 # to that level are automatically included in the table of contents, even if
 # they do not have an id attribute.
 # Note: This feature currently applies only to Markdown headings.
-# Minimum value: 0, maximum value: 99, default value: 0.
+# Minimum value: 0, maximum value: 99, default value: 5.
 # This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
 
 TOC_INCLUDE_HEADINGS   = 0
@@ -337,7 +366,7 @@ BUILTIN_STL_SUPPORT    = NO
 CPP_CLI_SUPPORT        = NO
 
 # Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
-# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
 # will parse them like normal C++ but will assume all classes use public instead
 # of private inheritance when no explicit protection keyword is present.
 # The default value is: NO.
@@ -423,6 +452,19 @@ TYPEDEF_HIDES_STRUCT   = NO
 
 LOOKUP_CACHE_SIZE      = 0
 
+# The NUM_PROC_THREADS specifies the number threads doxygen is allowed to use
+# during processing. When set to 0 doxygen will based this on the number of
+# cores available in the system. You can set it explicitly to a value larger
+# than 0 to get more control over the balance between CPU load and processing
+# speed. At this moment only the input processing can be done using multiple
+# threads. Since this is still an experimental feature the default is set to 1,
+# which effectively disables parallel processing. Please report any issues you
+# encounter. Generating dot graphs in parallel is controlled by the
+# DOT_NUM_THREADS setting.
+# Minimum value: 0, maximum value: 32, default value: 1.
+
+NUM_PROC_THREADS       = 1
+
 #---------------------------------------------------------------------------
 # Build related configuration options
 #---------------------------------------------------------------------------
@@ -443,6 +485,12 @@ EXTRACT_ALL            = NO
 
 EXTRACT_PRIVATE        = NO
 
+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
+# methods of a class will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIV_VIRTUAL   = NO
+
 # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
 # scope will be included in the documentation.
 # The default value is: NO.
@@ -480,6 +528,13 @@ EXTRACT_LOCAL_METHODS  = NO
 
 EXTRACT_ANON_NSPACES   = NO
 
+# If this flag is set to YES, the name of an unnamed parameter in a declaration
+# will be determined by the corresponding definition. By default unnamed
+# parameters remain unnamed in the output.
+# The default value is: YES.
+
+RESOLVE_UNNAMED_PARAMS = YES
+
 # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
 # undocumented members inside documented classes or files. If set to NO these
 # members will be included in the various overviews, but no documentation
@@ -497,11 +552,11 @@ HIDE_UNDOC_MEMBERS     = NO
 HIDE_UNDOC_CLASSES     = YES
 
 # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO, these declarations will be
-# included in the documentation.
+# declarations. If set to NO, these declarations will be included in the
+# documentation.
 # The default value is: NO.
 
-HIDE_FRIEND_COMPOUNDS  = NO
+HIDE_FRIEND_COMPOUNDS  = YES
 
 # If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
 # documentation blocks found inside the body of a function. If set to NO, these
@@ -517,11 +572,18 @@ HIDE_IN_BODY_DOCS      = NO
 
 INTERNAL_DOCS          = NO
 
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES, upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
+# With the correct setting of option CASE_SENSE_NAMES doxygen will better be
+# able to match the capabilities of the underlying filesystem. In case the
+# filesystem is case sensitive (i.e. it supports files in the same directory
+# whose names only differ in casing), the option must be set to YES to properly
+# deal with such files in case they appear in the input. For filesystems that
+# are not case sensitive the option should be be set to NO to properly deal with
+# output files written for symbols that only differ in casing, such as for two
+# classes, one named CLASS and the other named Class, and to also support
+# references to files without having to specify the exact matching casing. On
+# Windows (including Cygwin) and MacOS, users should typically set this option
+# to NO, whereas on Linux or other Unix flavors it should typically be set to
+# YES.
 # The default value is: system dependent.
 
 CASE_SENSE_NAMES       = YES
@@ -540,6 +602,12 @@ HIDE_SCOPE_NAMES       = NO
 
 HIDE_COMPOUND_REFERENCE= NO
 
+# If the SHOW_HEADERFILE tag is set to YES then the documentation for a class
+# will show which file needs to be included to use the class.
+# The default value is: YES.
+
+SHOW_HEADERFILE        = YES
+
 # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
 # the files that are included by a file in the documentation of that file.
 # The default value is: YES.
@@ -563,7 +631,7 @@ FORCE_LOCAL_INCLUDES   = NO
 # documentation for inline members.
 # The default value is: YES.
 
-INLINE_INFO            = YES
+INLINE_INFO            = NO
 
 # If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
 # (detailed) documentation of file and class members alphabetically by member
@@ -666,21 +734,21 @@ MAX_INITIALIZER_LINES  = 30
 # list will mention the files that were used to generate the documentation.
 # The default value is: YES.
 
-SHOW_USED_FILES        = YES
+SHOW_USED_FILES        = NO
 
 # Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
 # will remove the Files entry from the Quick Index and from the Folder Tree View
 # (if specified).
 # The default value is: YES.
 
-SHOW_FILES             = YES
+SHOW_FILES             = NO
 
 # Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
 # page. This will remove the Namespaces entry from the Quick Index and from the
 # Folder Tree View (if specified).
 # The default value is: YES.
 
-SHOW_NAMESPACES        = YES
+SHOW_NAMESPACES        = NO
 
 # The FILE_VERSION_FILTER tag can be used to specify a program or script that
 # doxygen should invoke to get the current version for each file (typically from
@@ -697,7 +765,8 @@ FILE_VERSION_FILTER    =
 # output files in an output format independent way. To create the layout file
 # that represents doxygen's defaults, run doxygen with the -l option. You can
 # optionally specify a file name after the option, if omitted DoxygenLayout.xml
-# will be used as the name of the layout file.
+# will be used as the name of the layout file. See also section "Changing the
+# layout of pages" for information.
 #
 # Note that if you run doxygen from a directory containing a file called
 # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
@@ -708,7 +777,7 @@ LAYOUT_FILE            =
 # The CITE_BIB_FILES tag can be used to specify one or more bib files containing
 # the reference definitions. This must be a list of .bib files. The .bib
 # extension is automatically appended if omitted. This requires the bibtex tool
-# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
 # For LaTeX the style of the bibliography can be controlled using
 # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
 # search path. See also \cite for info how to create references.
@@ -743,23 +812,35 @@ WARNINGS               = YES
 WARN_IF_UNDOCUMENTED   = YES
 
 # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some parameters
-# in a documented function, or documenting parameters that don't exist or using
-# markup commands wrongly.
+# potential errors in the documentation, such as documenting some parameters in
+# a documented function twice, or documenting parameters that don't exist or
+# using markup commands wrongly.
 # The default value is: YES.
 
 WARN_IF_DOC_ERROR      = YES
 
+# If WARN_IF_INCOMPLETE_DOC is set to YES, doxygen will warn about incomplete
+# function parameter documentation. If set to NO, doxygen will accept that some
+# parameters have no documentation without warning.
+# The default value is: YES.
+
+WARN_IF_INCOMPLETE_DOC = YES
+
 # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
 # are documented, but have no documentation for their parameters or return
-# value. If set to NO, doxygen will only warn about wrong or incomplete
-# parameter documentation, but not about the absence of documentation.
+# value. If set to NO, doxygen will only warn about wrong parameter
+# documentation, but not about the absence of documentation. If EXTRACT_ALL is
+# set to YES then this flag will automatically be disabled. See also
+# WARN_IF_INCOMPLETE_DOC
 # The default value is: NO.
 
 WARN_NO_PARAMDOC       = NO
 
 # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
-# a warning is encountered.
+# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS
+# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but
+# at the end of the doxygen process doxygen will return with a non-zero status.
+# Possible values are: NO, YES and FAIL_ON_WARNINGS.
 # The default value is: NO.
 
 WARN_AS_ERROR          = NO
@@ -776,7 +857,10 @@ WARN_FORMAT            = "$file:$line: $text"
 
 # The WARN_LOGFILE tag can be used to specify a file to which warning and error
 # messages should be written. If left blank the output is written to standard
-# error (stderr).
+# error (stderr). In case the file specified cannot be opened for writing the
+# warning and error messages are written to standard error. When as file - is
+# specified the warning and error messages are written to standard output
+# (stdout).
 
 WARN_LOGFILE           =
 
@@ -790,14 +874,13 @@ WARN_LOGFILE           =
 # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.
 
-INPUT                  = thrust \
-                         examples
+INPUT                  = thrust
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
 # libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: http://www.gnu.org/software/libiconv) for the list of
-# possible encodings.
+# documentation (see:
+# https://www.gnu.org/software/libiconv/) for the list of possible encodings.
 # The default value is: UTF-8.
 
 INPUT_ENCODING         = UTF-8
@@ -810,11 +893,15 @@ INPUT_ENCODING         = UTF-8
 # need to set EXTENSION_MAPPING for the extension otherwise the files are not
 # read by doxygen.
 #
+# Note the list of default checked file patterns might differ from the list of
+# default file extension mappings.
+#
 # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
 # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
-# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
-# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08,
-# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf.
+# *.hh, *.hxx, *.hpp, *.h++, *.l, *.cs, *.d, *.php, *.php4, *.php5, *.phtml,
+# *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C
+# comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd,
+# *.vhdl, *.ucf, *.qsf and *.ice.
 
 FILE_PATTERNS          =
 
@@ -831,7 +918,7 @@ RECURSIVE              = YES
 # Note that relative paths are relative to the directory from which doxygen is
 # run.
 
-EXCLUDE                = examples
+EXCLUDE                =
 
 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
 # directories that are symbolic links (a Unix file system feature) are excluded
@@ -847,13 +934,13 @@ EXCLUDE_SYMLINKS       = NO
 # Note that the wildcards are matched against the file with absolute path, so to
 # exclude all test directories for example use the pattern */test/*
 
-EXCLUDE_PATTERNS       = */detail/*
+EXCLUDE_PATTERNS       = *detail*
 
 # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
 # (namespaces, classes, functions, etc.) that should be excluded from the
 # output. The symbol name can be a fully qualified name, a word, or if the
 # wildcard * is used, a substring. Examples: ANamespace, AClass,
-# AClass::ANamespace, ANamespace::*Test
+# ANamespace::AClass, ANamespace::*Test
 #
 # Note that the wildcards are matched against the file with absolute path, so to
 # exclude all test directories use the pattern */test/*
@@ -969,7 +1056,7 @@ INLINE_SOURCES         = NO
 STRIP_CODE_COMMENTS    = YES
 
 # If the REFERENCED_BY_RELATION tag is set to YES then for each documented
-# function all documented functions referencing it will be listed.
+# entity all documented functions referencing it will be listed.
 # The default value is: NO.
 
 REFERENCED_BY_RELATION = YES
@@ -1001,12 +1088,12 @@ SOURCE_TOOLTIPS        = YES
 # If the USE_HTAGS tag is set to YES then the references to source code will
 # point to the HTML generated by the htags(1) tool instead of doxygen built-in
 # source browser. The htags tool is part of GNU's global source tagging system
-# (see http://www.gnu.org/software/global/global.html). You will need version
+# (see https://www.gnu.org/software/global/global.html). You will need version
 # 4.8.6 or higher.
 #
 # To use it do the following:
 # - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
 # - Make sure the INPUT points to the root of the source tree
 # - Run doxygen as normal
 #
@@ -1028,25 +1115,6 @@ USE_HTAGS              = NO
 
 VERBATIM_HEADERS       = YES
 
-# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
-# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
-# cost of reduced performance. This can be particularly helpful with template
-# rich C++ code for which doxygen's built-in parser lacks the necessary type
-# information.
-# Note: The availability of this option depends on whether or not doxygen was
-# generated with the -Duse-libclang=ON option for CMake.
-# The default value is: NO.
-
-CLANG_ASSISTED_PARSING = NO
-
-# If clang assisted parsing is enabled you can provide the compiler with command
-# line options that you would normally use when invoking the compiler. Note that
-# the include paths will already be set by doxygen for the files and directories
-# specified with INPUT and INCLUDE_PATH.
-# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
-
-CLANG_OPTIONS          =
-
 #---------------------------------------------------------------------------
 # Configuration options related to the alphabetical class index
 #---------------------------------------------------------------------------
@@ -1056,14 +1124,7 @@ CLANG_OPTIONS          =
 # classes, structs, unions or interfaces.
 # The default value is: YES.
 
-ALPHABETICAL_INDEX     = NO
-
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX    = 5
+ALPHABETICAL_INDEX     = YES
 
 # In case all classes in a project start with a common prefix, all classes will
 # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
@@ -1080,7 +1141,7 @@ IGNORE_PREFIX          =
 # If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
 # The default value is: YES.
 
-GENERATE_HTML          = YES
+GENERATE_HTML          = NO
 
 # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
 # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
@@ -1088,7 +1149,7 @@ GENERATE_HTML          = YES
 # The default directory is: html.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-HTML_OUTPUT            = html
+HTML_OUTPUT            = build_docs/doxygen/html
 
 # The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
 # generated HTML page (for example: .htm, .php, .asp).
@@ -1164,8 +1225,8 @@ HTML_EXTRA_FILES       =
 
 # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
 # will adjust the colors in the style sheet and background images according to
-# this color. Hue is specified as an angle on a colorwheel, see
-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# this color. Hue is specified as an angle on a color-wheel, see
+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
 # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
 # purple, and 360 is red again.
 # Minimum value: 0, maximum value: 359, default value: 220.
@@ -1174,7 +1235,7 @@ HTML_EXTRA_FILES       =
 HTML_COLORSTYLE_HUE    = 220
 
 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
-# in the HTML output. For a value of 0 the output will use grayscales only. A
+# in the HTML output. For a value of 0 the output will use gray-scales only. A
 # value of 255 will produce the most vivid colors.
 # Minimum value: 0, maximum value: 255, default value: 100.
 # This tag requires that the tag GENERATE_HTML is set to YES.
@@ -1201,6 +1262,17 @@ HTML_COLORSTYLE_GAMMA  = 80
 
 HTML_TIMESTAMP         = NO
 
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via JavaScript. If disabled, the navigation index will
+# consists of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have JavaScript,
+# like the Qt help browser.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_MENUS     = YES
+
 # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
 # documentation will contain sections that can be hidden and shown after the
 # page has loaded.
@@ -1224,13 +1296,14 @@ HTML_INDEX_NUM_ENTRIES = 100
 
 # If the GENERATE_DOCSET tag is set to YES, additional index files will be
 # generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: http://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
+# environment (see:
+# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
+# create a documentation set, doxygen will generate a Makefile in the HTML
+# output directory. Running make will produce the docset in that directory and
+# running make install will install the docset in
 # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
+# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
+# genXcode/_index.html for more information.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
@@ -1244,6 +1317,13 @@ GENERATE_DOCSET        = NO
 
 DOCSET_FEEDNAME        = "Doxygen generated docs"
 
+# This tag determines the URL of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDURL         =
+
 # This tag specifies a string that should uniquely identify the documentation
 # set bundle. This should be a reverse domain-name style string, e.g.
 # com.mycompany.MyDocSet. Doxygen will append .docset to the name.
@@ -1269,8 +1349,12 @@ DOCSET_PUBLISHER_NAME  = Publisher
 # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
 # additional HTML index files: index.hhp, index.hhc, and index.hhk. The
 # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
+# on Windows. In the beginning of 2021 Microsoft took the original page, with
+# a.o. the download links, offline the HTML help workshop was already many years
+# in maintenance mode). You can download the HTML help workshop from the web
+# archives at Installation executable (see:
+# http://web.archive.org/web/20160201063255/http://download.microsoft.com/downlo
+# ad/0/A/9/0A939EF6-E31C-430F-A3DF-DFAE7960D564/htmlhelp.exe).
 #
 # The HTML Help Workshop contains a compiler that can convert all HTML output
 # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
@@ -1300,7 +1384,7 @@ CHM_FILE               =
 HHC_LOCATION           =
 
 # The GENERATE_CHI flag controls if a separate .chi index file is generated
-# (YES) or that it should be included in the master .chm file (NO).
+# (YES) or that it should be included in the main .chm file (NO).
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
@@ -1345,7 +1429,8 @@ QCH_FILE               =
 
 # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
 # Project output. For more information please see Qt Help Project / Namespace
-# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
+# (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
 # The default value is: org.doxygen.Project.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
@@ -1353,8 +1438,8 @@ QHP_NAMESPACE          = org.doxygen.Project
 
 # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
 # Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
-# folders).
+# Folders (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
 # The default value is: doc.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
@@ -1362,30 +1447,30 @@ QHP_VIRTUAL_FOLDER     = doc
 
 # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
 # filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_NAME   =
 
 # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
 # custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_ATTRS  =
 
 # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
 # project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_SECT_FILTER_ATTRS  =
 
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
+# The QHG_LOCATION tag can be used to specify the location (absolute path
+# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to
+# run qhelpgenerator on the generated .qhp file.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHG_LOCATION           =
@@ -1419,7 +1504,7 @@ ECLIPSE_DOC_ID         = org.doxygen.Project
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
-DISABLE_INDEX          = NO
+DISABLE_INDEX          = YES
 
 # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
 # structure should be generated to display hierarchical information. If the tag
@@ -1428,16 +1513,28 @@ DISABLE_INDEX          = NO
 # to work a browser that supports JavaScript, DHTML, CSS and frames is required
 # (i.e. any modern browser). Windows users are probably better off using the
 # HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
-# further fine-tune the look of the index. As an example, the default style
-# sheet generated by doxygen has an example that shows how to put an image at
-# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
-# the same information as the tab index, you could consider setting
-# DISABLE_INDEX to YES when enabling this option.
+# further fine tune the look of the index (see "Fine-tuning the output"). As an
+# example, the default style sheet generated by doxygen has an example that
+# shows how to put an image at the root of the tree instead of the PROJECT_NAME.
+# Since the tree basically has the same information as the tab index, you could
+# consider setting DISABLE_INDEX to YES when enabling this option.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
 GENERATE_TREEVIEW      = NO
 
+# When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the
+# FULL_SIDEBAR option determines if the side bar is limited to only the treeview
+# area (value NO) or if it should extend to the full height of the window (value
+# YES). Setting this to YES gives a layout similar to
+# https://docs.readthedocs.io with more room for contents, but less room for the
+# project logo, title, and description. If either GENERATE_TREEVIEW or
+# DISABLE_INDEX is set to NO, this option has no effect.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FULL_SIDEBAR           = NO
+
 # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
 # doxygen will group on one line in the generated HTML documentation.
 #
@@ -1462,6 +1559,17 @@ TREEVIEW_WIDTH         = 250
 
 EXT_LINKS_IN_WINDOW    = NO
 
+# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg
+# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
+# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
+# the HTML output. These images will generally look nicer at scaled resolutions.
+# Possible values are: png (the default) and svg (looks nicer but requires the
+# pdf2svg or inkscape tool).
+# The default value is: png.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FORMULA_FORMAT    = png
+
 # Use this tag to change the font size of LaTeX formulas included as images in
 # the HTML documentation. When you change the font size after a successful
 # doxygen run you need to manually remove any form_*.png images from the HTML
@@ -1471,7 +1579,7 @@ EXT_LINKS_IN_WINDOW    = NO
 
 FORMULA_FONTSIZE       = 10
 
-# Use the FORMULA_TRANPARENT tag to determine whether or not the images
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
 # generated for formulas are transparent PNGs. Transparent PNGs are not
 # supported properly for IE 6.0, but are supported on all modern browsers.
 #
@@ -1482,8 +1590,14 @@ FORMULA_FONTSIZE       = 10
 
 FORMULA_TRANSPARENT    = YES
 
+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
+# to create new LaTeX commands to be used in formulas as building blocks. See
+# the section "Including formulas" for details.
+
+FORMULA_MACROFILE      =
+
 # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# http://www.mathjax.org) which uses client side Javascript for the rendering
+# https://www.mathjax.org) which uses client side JavaScript for the rendering
 # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
 # installed or if you want to formulas look prettier in the HTML output. When
 # enabled you may also need to install MathJax separately and configure the path
@@ -1493,11 +1607,29 @@ FORMULA_TRANSPARENT    = YES
 
 USE_MATHJAX            = NO
 
+# With MATHJAX_VERSION it is possible to specify the MathJax version to be used.
+# Note that the different versions of MathJax have different requirements with
+# regards to the different settings, so it is possible that also other MathJax
+# settings have to be changed when switching between the different MathJax
+# versions.
+# Possible values are: MathJax_2 and MathJax_3.
+# The default value is: MathJax_2.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_VERSION        = MathJax_2
+
 # When MathJax is enabled you can set the default output format to be used for
-# the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
+# the MathJax output. For more details about the output format see MathJax
+# version 2 (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) and MathJax version 3
+# (see:
+# http://docs.mathjax.org/en/latest/web/components/output.html).
 # Possible values are: HTML-CSS (which is slower, but has the best
-# compatibility), NativeMML (i.e. MathML) and SVG.
+# compatibility. This is the name for Mathjax version 2, for MathJax version 3
+# this will be translated into chtml), NativeMML (i.e. MathML. Only supported
+# for NathJax 2. For MathJax version 3 chtml will be used instead.), chtml (This
+# is the name for Mathjax version 3, for MathJax version 2 this will be
+# translated into HTML-CSS) and SVG.
 # The default value is: HTML-CSS.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
@@ -1510,22 +1642,29 @@ MATHJAX_FORMAT         = HTML-CSS
 # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
 # Content Delivery Network so you can quickly see the result without installing
 # MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from http://www.mathjax.org before deployment.
-# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# MathJax from https://www.mathjax.org before deployment. The default value is:
+# - in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2
+# - in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
 MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
 
 # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
 # extension names that should be enabled during MathJax rendering. For example
+# for MathJax version 2 (see https://docs.mathjax.org/en/v2.7-latest/tex.html
+# #tex-and-latex-extensions):
 # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# For example for MathJax version 3 (see
+# http://docs.mathjax.org/en/latest/input/tex/extensions/index.html):
+# MATHJAX_EXTENSIONS = ams
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
 MATHJAX_EXTENSIONS     =
 
 # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
 # of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
 # example see the documentation.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
@@ -1553,7 +1692,7 @@ MATHJAX_CODEFILE       =
 SEARCHENGINE           = NO
 
 # When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a web server instead of a web client using Javascript. There
+# implemented using a web server instead of a web client using JavaScript. There
 # are two flavors of web server based searching depending on the EXTERNAL_SEARCH
 # setting. When disabled, doxygen will generate a PHP script for searching and
 # an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
@@ -1572,7 +1711,8 @@ SERVER_BASED_SEARCH    = NO
 #
 # Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/).
+# Xapian (see:
+# https://xapian.org/).
 #
 # See the section "External Indexing and Searching" for details.
 # The default value is: NO.
@@ -1585,8 +1725,9 @@ EXTERNAL_SEARCH        = NO
 #
 # Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/). See the section "External Indexing and
-# Searching" for details.
+# Xapian (see:
+# https://xapian.org/). See the section "External Indexing and Searching" for
+# details.
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
 SEARCHENGINE_URL       =
@@ -1637,21 +1778,35 @@ LATEX_OUTPUT           = latex
 # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
 # invoked.
 #
-# Note that when enabling USE_PDFLATEX this option is only used for generating
-# bitmaps for formulas in the HTML output, but not in the Makefile that is
-# written to the output directory.
-# The default file is: latex.
+# Note that when not enabling USE_PDFLATEX the default is latex when enabling
+# USE_PDFLATEX the default is pdflatex and when in the later case latex is
+# chosen this is overwritten by pdflatex. For specific output languages the
+# default can have been set differently, this depends on the implementation of
+# the output language.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
-LATEX_CMD_NAME         = latex
+LATEX_CMD_NAME         =
 
 # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
 # index for LaTeX.
+# Note: This tag is used in the Makefile / make.bat.
+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
+# (.tex).
 # The default file is: makeindex.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 MAKEINDEX_CMD_NAME     = makeindex
 
+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
+# generate index for LaTeX. In case there is no backslash (\) as first character
+# it will be automatically added in the LaTeX code.
+# Note: This tag is used in the generated output file (.tex).
+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
+# The default value is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_MAKEINDEX_CMD    = makeindex
+
 # If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
 # documents. This may be useful for small projects and may help to save some
 # trees in general.
@@ -1667,7 +1822,7 @@ COMPACT_LATEX          = NO
 # The default value is: a4.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
-PAPER_TYPE             = a4wide
+PAPER_TYPE             = a4
 
 # The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
 # that should be included in the LaTeX output. The package can be specified just
@@ -1681,29 +1836,31 @@ PAPER_TYPE             = a4wide
 
 EXTRA_PACKAGES         =
 
-# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
-# generated LaTeX document. The header should contain everything until the first
-# chapter. If it is left blank doxygen will generate a standard header. See
-# section "Doxygen usage" for information on how to let doxygen write the
-# default header to a separate file.
+# The LATEX_HEADER tag can be used to specify a user-defined LaTeX header for
+# the generated LaTeX document. The header should contain everything until the
+# first chapter. If it is left blank doxygen will generate a standard header. It
+# is highly recommended to start with a default header using
+# doxygen -w latex new_header.tex new_footer.tex new_stylesheet.sty
+# and then modify the file new_header.tex. See also section "Doxygen usage" for
+# information on how to generate the default header that doxygen normally uses.
 #
-# Note: Only use a user-defined header if you know what you are doing! The
-# following commands have a special meaning inside the header: $title,
-# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
-# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
-# string, for the replacement values of the other commands the user is referred
-# to HTML_HEADER.
+# Note: Only use a user-defined header if you know what you are doing!
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. The following
+# commands have a special meaning inside the header (and footer): For a
+# description of the possible markers and block names see the documentation.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_HEADER           =
 
-# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
-# generated LaTeX document. The footer should contain everything after the last
-# chapter. If it is left blank doxygen will generate a standard footer. See
+# The LATEX_FOOTER tag can be used to specify a user-defined LaTeX footer for
+# the generated LaTeX document. The footer should contain everything after the
+# last chapter. If it is left blank doxygen will generate a standard footer. See
 # LATEX_HEADER for more information on how to generate a default footer and what
-# special commands can be used inside the footer.
-#
-# Note: Only use a user-defined footer if you know what you are doing!
+# special commands can be used inside the footer. See also section "Doxygen
+# usage" for information on how to generate the default footer that doxygen
+# normally uses. Note: Only use a user-defined footer if you know what you are
+# doing!
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_FOOTER           =
@@ -1734,20 +1891,21 @@ LATEX_EXTRA_FILES      =
 # The default value is: YES.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
-PDF_HYPERLINKS         = NO
+PDF_HYPERLINKS         = YES
 
-# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES, to get a
-# higher quality PDF documentation.
+# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
+# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
+# files. Set this option to YES, to get a higher quality PDF documentation.
+#
+# See also section LATEX_CMD_NAME for selecting the engine.
 # The default value is: YES.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
-USE_PDFLATEX           = NO
+USE_PDFLATEX           = YES
 
 # If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
 # command to the generated LaTeX files. This will instruct LaTeX to keep running
-# if errors occur, instead of asking the user for help. This option is also used
-# when generating formulas in HTML.
+# if errors occur, instead of asking the user for help.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
@@ -1760,19 +1918,9 @@ LATEX_BATCHMODE        = NO
 
 LATEX_HIDE_INDICES     = NO
 
-# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
-# code with syntax highlighting in the LaTeX output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_SOURCE_CODE      = NO
-
 # The LATEX_BIB_STYLE tag can be used to specify the style to use for the
 # bibliography, e.g. plainnat, or ieeetr. See
-# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
 # The default value is: plain.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
@@ -1786,6 +1934,14 @@ LATEX_BIB_STYLE        = plain
 
 LATEX_TIMESTAMP        = NO
 
+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
+# path from which the emoji images will be read. If a relative path is entered,
+# it will be relative to the LATEX_OUTPUT directory. If left blank the
+# LATEX_OUTPUT directory will be used.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EMOJI_DIRECTORY  =
+
 #---------------------------------------------------------------------------
 # Configuration options related to the RTF output
 #---------------------------------------------------------------------------
@@ -1825,9 +1981,9 @@ COMPACT_RTF            = NO
 
 RTF_HYPERLINKS         = NO
 
-# Load stylesheet definitions from file. Syntax is similar to doxygen's config
-# file, i.e. a series of assignments. You only have to provide replacements,
-# missing definitions are set to their default value.
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# configuration file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
 #
 # See also section "Doxygen usage" for information on how to generate the
 # default style sheet that doxygen normally uses.
@@ -1836,22 +1992,12 @@ RTF_HYPERLINKS         = NO
 RTF_STYLESHEET_FILE    =
 
 # Set optional variables used in the generation of an RTF document. Syntax is
-# similar to doxygen's config file. A template extensions file can be generated
-# using doxygen -e rtf extensionFile.
+# similar to doxygen's configuration file. A template extensions file can be
+# generated using doxygen -e rtf extensionFile.
 # This tag requires that the tag GENERATE_RTF is set to YES.
 
 RTF_EXTENSIONS_FILE    =
 
-# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
-# with syntax highlighting in the RTF output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_SOURCE_CODE        = NO
-
 #---------------------------------------------------------------------------
 # Configuration options related to the man page output
 #---------------------------------------------------------------------------
@@ -1904,7 +2050,7 @@ MAN_LINKS              = NO
 # captures the structure of the code including all documentation.
 # The default value is: NO.
 
-GENERATE_XML           = NO
+GENERATE_XML           = YES
 
 # The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
 # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
@@ -1912,7 +2058,7 @@ GENERATE_XML           = NO
 # The default directory is: xml.
 # This tag requires that the tag GENERATE_XML is set to YES.
 
-XML_OUTPUT             = xml
+XML_OUTPUT             = build_docs/doxygen/xml
 
 # If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
 # listings (including syntax highlighting and cross-referencing information) to
@@ -1923,6 +2069,13 @@ XML_OUTPUT             = xml
 
 XML_PROGRAMLISTING     = YES
 
+# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
+# namespace members in file scope as well, matching the HTML output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_NS_MEMB_FILE_SCOPE = NO
+
 #---------------------------------------------------------------------------
 # Configuration options related to the DOCBOOK output
 #---------------------------------------------------------------------------
@@ -1941,23 +2094,14 @@ GENERATE_DOCBOOK       = NO
 
 DOCBOOK_OUTPUT         = docbook
 
-# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
-# program listings (including syntax highlighting and cross-referencing
-# information) to the DOCBOOK output. Note that enabling this will significantly
-# increase the size of the DOCBOOK output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
-
-DOCBOOK_PROGRAMLISTING = NO
-
 #---------------------------------------------------------------------------
 # Configuration options for the AutoGen Definitions output
 #---------------------------------------------------------------------------
 
 # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
-# AutoGen Definitions (see http://autogen.sf.net) file that captures the
-# structure of the code including all documentation. Note that this feature is
-# still experimental and incomplete at the moment.
+# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
+# the structure of the code including all documentation. Note that this feature
+# is still experimental and incomplete at the moment.
 # The default value is: NO.
 
 GENERATE_AUTOGEN_DEF   = NO
@@ -2057,15 +2201,12 @@ INCLUDE_FILE_PATTERNS  =
 # recursively expanded use the := operator instead of the = operator.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
-PREDEFINED             = THRUST_NOEXCEPT=noexcept \
-                         "THRUST_DEFAULT={}" \
-                         "THRUST_NODISCARD=[[nodiscard]]" \
-                         "THRUST_MR_DEFAULT_ALIGNMENT=alignof(max_align_t)" \
-                         "THRUST_FINAL=final" \
-                         "THRUST_OVERRIDE=" \
-                         "THRUST_BEGIN_NS=namespace thrust {" \
-                         "THRUST_END_NS=}" \
-                         "cuda_cub=system::cuda"
+PREDEFINED             = THRUST_DOXYGEN \
+                         THRUST_CPP_DIALECT=2017 \
+                         THRUST_NODISCARD=[[nodiscard]] \
+                         THRUST_MR_DEFAULT_ALIGNMENT=alignof(std::max_align_t) \
+                         "THRUST_NAMESPACE_BEGIN=namespace thrust {" \
+                         THRUST_NAMESPACE_END=}
 
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
 # tag can be used to specify a list of macro names that should be expanded. The
@@ -2130,36 +2271,12 @@ EXTERNAL_GROUPS        = YES
 # be listed.
 # The default value is: YES.
 
-EXTERNAL_PAGES         = YES
-
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of 'which perl').
-# The default file (with absolute path) is: /usr/bin/perl.
-
-PERL_PATH              = /usr/bin/perl
+EXTERNAL_PAGES         = NO
 
 #---------------------------------------------------------------------------
 # Configuration options related to the dot tool
 #---------------------------------------------------------------------------
 
-# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
-# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
-# NO turns the diagrams off. Note that this option also works with HAVE_DOT
-# disabled, but it is recommended to install and use dot, since it yields more
-# powerful graphs.
-# The default value is: YES.
-
-CLASS_DIAGRAMS         = YES
-
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see:
-# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH            =
-
 # You can include diagrams made with dia in doxygen documentation. Doxygen will
 # then run dia to produce the diagram and insert it in the documentation. The
 # DIA_PATH tag allows you to specify the directory where the dia binary resides.
@@ -2178,7 +2295,7 @@ HIDE_UNDOC_RELATIONS   = YES
 # http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
 # Bell Labs. The other options in this section have no effect if this option is
 # set to NO
-# The default value is: YES.
+# The default value is: NO.
 
 HAVE_DOT               = NO
 
@@ -2216,13 +2333,16 @@ DOT_FONTSIZE           = 10
 
 DOT_FONTPATH           =
 
-# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
-# each documented class showing the direct and indirect inheritance relations.
-# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a
+# graph for each documented class showing the direct and indirect inheritance
+# relations. In case HAVE_DOT is set as well dot will be used to draw the graph,
+# otherwise the built-in generator will be used. If the CLASS_GRAPH tag is set
+# to TEXT the direct and indirect inheritance relations will be shown as texts /
+# links.
+# Possible values are: NO, YES, TEXT and GRAPH.
 # The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
 
-CLASS_GRAPH            = YES
+CLASS_GRAPH            = NO
 
 # If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
 # graph for each documented class showing the direct and indirect implementation
@@ -2231,14 +2351,14 @@ CLASS_GRAPH            = YES
 # The default value is: YES.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
-COLLABORATION_GRAPH    = YES
+COLLABORATION_GRAPH    = NO
 
 # If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
 # groups, showing the direct groups dependencies.
 # The default value is: YES.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
-GROUP_GRAPHS           = YES
+GROUP_GRAPHS           = NO
 
 # If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
 # collaboration diagrams in a style similar to the OMG's Unified Modeling
@@ -2257,10 +2377,32 @@ UML_LOOK               = NO
 # but if the number exceeds 15, the total amount of fields shown is limited to
 # 10.
 # Minimum value: 0, maximum value: 100, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
+# This tag requires that the tag UML_LOOK is set to YES.
 
 UML_LIMIT_NUM_FIELDS   = 10
 
+# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and
+# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS
+# tag is set to YES, doxygen will add type and arguments for attributes and
+# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen
+# will not generate fields with class member information in the UML graphs. The
+# class diagrams will look similar to the default class diagrams but using UML
+# notation for the relationships.
+# Possible values are: NO, YES and NONE.
+# The default value is: NO.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+DOT_UML_DETAILS        = NO
+
+# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters
+# to display on a single line. If the actual line length exceeds this threshold
+# significantly it will wrapped across multiple lines. Some heuristics are apply
+# to avoid ugly line breaks.
+# Minimum value: 0, maximum value: 1000, default value: 17.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_WRAP_THRESHOLD     = 17
+
 # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
 # collaboration graphs will show the relations between templates and their
 # instances.
@@ -2276,7 +2418,7 @@ TEMPLATE_RELATIONS     = NO
 # The default value is: YES.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
-INCLUDE_GRAPH          = YES
+INCLUDE_GRAPH          = NO
 
 # If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
 # set to YES then doxygen will generate a graph for each documented file showing
@@ -2285,7 +2427,7 @@ INCLUDE_GRAPH          = YES
 # The default value is: YES.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
-INCLUDED_BY_GRAPH      = YES
+INCLUDED_BY_GRAPH      = NO
 
 # If the CALL_GRAPH tag is set to YES then doxygen will generate a call
 # dependency graph for every global function or class method.
@@ -2316,7 +2458,7 @@ CALLER_GRAPH           = NO
 # The default value is: YES.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
-GRAPHICAL_HIERARCHY    = YES
+GRAPHICAL_HIERARCHY    = NO
 
 # If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
 # dependencies a directory has on other directories in a graphical way. The
@@ -2325,7 +2467,14 @@ GRAPHICAL_HIERARCHY    = YES
 # The default value is: YES.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
-DIRECTORY_GRAPH        = YES
+DIRECTORY_GRAPH        = NO
+
+# The DIR_GRAPH_MAX_DEPTH tag can be used to limit the maximum number of levels
+# of child directories generated in directory dependency graphs by dot.
+# Minimum value: 1, maximum value: 25, default value: 1.
+# This tag requires that the tag DIRECTORY_GRAPH is set to YES.
+
+DIR_GRAPH_MAX_DEPTH    = 1
 
 # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
 # generated by dot. For an explanation of the image formats see the section
@@ -2334,9 +2483,7 @@ DIRECTORY_GRAPH        = YES
 # Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
 # to make the SVG files visible in IE 9+ (other browsers do not have this
 # requirement).
-# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd,
-# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo,
-# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo,
+# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo,
 # png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
 # png:gdiplus:gdiplus.
 # The default value is: png.
@@ -2382,10 +2529,10 @@ MSCFILE_DIRS           =
 DIAFILE_DIRS           =
 
 # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
-# path where java can find the plantuml.jar file. If left blank, it is assumed
-# PlantUML is not used or called during a preprocessing step. Doxygen will
-# generate a warning when it encounters a \startuml command in this case and
-# will not generate output for the diagram.
+# path where java can find the plantuml.jar file or to the filename of jar file
+# to be used. If left blank, it is assumed PlantUML is not used or called during
+# a preprocessing step. Doxygen will generate a warning when it encounters a
+# \startuml command in this case and will not generate output for the diagram.
 
 PLANTUML_JAR_PATH      =
 
@@ -2447,14 +2594,18 @@ DOT_MULTI_TARGETS      = NO
 # If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
 # explaining the meaning of the various boxes and arrows in the dot generated
 # graphs.
+# Note: This tag requires that UML_LOOK isn't set, i.e. the doxygen internal
+# graphical representation for inheritance and collaboration diagrams is used.
 # The default value is: YES.
 # This tag requires that the tag HAVE_DOT is set to YES.
 
 GENERATE_LEGEND        = YES
 
-# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate
 # files that are used to generate the various graphs.
+#
+# Note: This setting is not only used for dot files but also for msc temporary
+# files.
 # The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_CLEANUP            = YES
diff --git a/docs/generate_markdown.bash b/docs/generate_markdown.bash
new file mode 100755
index 000000000..3b711db10
--- /dev/null
+++ b/docs/generate_markdown.bash
@@ -0,0 +1,106 @@
+#! /usr/bin/env bash
+
+###############################################################################
+# Copyright (c) 2018-2021 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+###############################################################################
+
+set -e
+
+function usage {
+  echo "Usage: ${0} [flags...]"
+  echo
+  echo "Generate Thrust documentation markdown with Doxygen and Doxybook that "
+  echo "can be served with Jekyll."
+  echo
+  echo "-h, -help, --help"
+  echo "  Print this message."
+  echo
+  echo "-c, --clean"
+  echo "  Delete the all existing build artifacts before generating the "
+  echo "  markdown."
+
+  exit -3
+}
+
+LOCAL=0
+CLEAN=0
+
+while test ${#} != 0
+do
+  case "${1}" in
+  -h) ;&
+  -help) ;&
+  --help) usage ;;
+  -c) ;&
+  --clean) CLEAN=1 ;;
+  esac
+  shift
+done
+
+SCRIPT_PATH=$(cd $(dirname ${0}); pwd -P)
+
+REPO_PATH=${SCRIPT_PATH}/..
+
+BUILD_DOCS_PATH=build_docs
+BUILD_DOXYGEN_PATH=${BUILD_DOCS_PATH}/doxygen
+BUILD_GITHUB_PAGES_PATH=${BUILD_DOCS_PATH}/github_pages
+
+cd ${REPO_PATH}
+
+if [[ "${CLEAN}" == 1 ]]; then
+  rm -rf ${BUILD_DOXYGEN_PATH}
+  rm -rf ${BUILD_GITHUB_PAGES_PATH}
+fi
+
+mkdir -p ${BUILD_DOXYGEN_PATH}/xml
+mkdir -p ${BUILD_GITHUB_PAGES_PATH}
+mkdir -p ${BUILD_GITHUB_PAGES_PATH}/api
+mkdir -p ${BUILD_GITHUB_PAGES_PATH}/contributing
+mkdir -p ${BUILD_GITHUB_PAGES_PATH}/releases
+
+# Copy all the documentation sources and Jekyll configuration into
+# `{BUILD_GITHUB_PAGES_PATH}`.
+cp -ur docs/github_pages/* ${BUILD_GITHUB_PAGES_PATH}/
+cp README.md               ${BUILD_GITHUB_PAGES_PATH}/overview.md
+cp CODE_OF_CONDUCT.md      ${BUILD_GITHUB_PAGES_PATH}/contributing/code_of_conduct.md
+cp CHANGELOG.md            ${BUILD_GITHUB_PAGES_PATH}/releases/changelog.md
+
+doxygen docs/doxygen/config.dox
+
+# `--debug-templates` will cause JSON output to be generated, which is useful
+# for debugging.
+doxybook2 --config docs/doxybook/config.json  \
+          --templates docs/doxybook/templates \
+          --debug-templates                   \
+          --input ${BUILD_DOXYGEN_PATH}/xml   \
+          --output ${BUILD_GITHUB_PAGES_PATH}/api
+
+# Doxygen and Doxybook don't give us a way to disable all the things we'd like,
+# so it's important to purge Doxybook Markdown output that we don't need:
+# 0) We want our Jekyll build to be as fast as possible and avoid wasting time
+#    on stuff we don't need.
+# 1) We don't want content that we don't plan to use to either show up on the
+#    site index or appear in search results.
+rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/files
+rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/index_files.md
+rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/pages
+rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/index_pages.md
+rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/examples
+rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/index_examples.md
+rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/images
+rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/index_namespaces.md
+rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/index_groups.md
+rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/index_classes.md
+
diff --git a/docs/github_pages/Gemfile b/docs/github_pages/Gemfile
new file mode 100644
index 000000000..09d948e17
--- /dev/null
+++ b/docs/github_pages/Gemfile
@@ -0,0 +1,10 @@
+source "https://rubygems.org"
+gem "just-the-docs"
+group :jekyll_plugins do
+  gem "github-pages"                 # GitHub Pages.
+  gem "jekyll-optional-front-matter" # GitHub Pages.
+  gem "jekyll-default-layout"        # GitHub Pages.
+  gem "jekyll-titles-from-headings"  # GitHub Pages.
+  gem "jekyll-relative-links"        # GitHub Pages.
+  gem "jekyll-include-cache"
+end
diff --git a/docs/github_pages/_config.yml b/docs/github_pages/_config.yml
new file mode 100644
index 000000000..c131e84fb
--- /dev/null
+++ b/docs/github_pages/_config.yml
@@ -0,0 +1,47 @@
+title: Thrust
+
+repository: nvidia/thrust
+
+remote_theme: pmarsceill/just-the-docs
+
+color_scheme: nvidia
+logo: /assets/images/nvidia_logo.png
+
+search_enabled: true
+search.heading_level: 4
+
+incremental: true
+
+# just-the-docs ignores these filenames by default.
+include: [ "contributing.md", "code_of_conduct.md" ]
+
+exclude: [ "node_modules", "doxybook_templates",
+           "generate_markdown.bash", "serve_docs_locally.bash" ]
+
+plugins:
+  - jekyll-optional-front-matter # GitHub Pages.
+  - jekyll-default-layout        # GitHub Pages.
+  - jekyll-titles-from-headings  # GitHub Pages.
+  - jekyll-relative-links        # GitHub Pages.
+  - jekyll-include-cache
+
+defaults:
+  -
+    scope:
+      path: overview.md
+    values:
+      title: Overview
+      nav_order: 0
+      permalink: /
+  -
+    scope:
+      path: contributing/code_of_conduct.md
+    values:
+      parent: Contributing
+      nav_order: 2
+  -
+    scope:
+      path: releases/changelog.md
+    values:
+      parent: Releases
+      nav_order: 0
diff --git a/docs/github_pages/_sass/color_schemes/nvidia.scss b/docs/github_pages/_sass/color_schemes/nvidia.scss
new file mode 100644
index 000000000..4b44fa222
--- /dev/null
+++ b/docs/github_pages/_sass/color_schemes/nvidia.scss
@@ -0,0 +1,145 @@
+$body-line-height: 1.4;
+$content-line-height: 1.4;
+.highlight { line-height: 1.0 !important; }
+
+/* h1 size. We make this smaller so the README title fits on one line. */
+$font-size-9: 30px;
+
+/* Inline code. */
+code,
+code.highlighter-rouge
+{ font-size: 0.85em !important; }
+
+/* Code blocks. */
+pre.highlight code { font-size: 0.9em !important; }
+
+/* Doxybook generated code snippets. */
+code.doxybook { display: block; }
+
+/* Line wrap with an indent of four characters in Doxybook-generated code snippets. */
+code.doxybook span
+{ display: block; text-indent: -4ex !important; padding-left: 4ex !important; }
+
+/* Line wrap with an indent of eight characters in Doxybook-generated code snippets. */
+code.doxybook span
+{ display: block; text-indent: -8ex !important; padding-left: 8ex !important; }
+
+/* Disable line wrap for indent <span>s. */
+code.doxybook
+{ display: block; text-indent: 0ex !important; padding-left: 0ex !important; }
+
+h3 { margin-bottom: 1.0em !important; }
+
+$nav-width: 300px;
+
+$body-background-color: $grey-dk-300;
+$sidebar-color: $grey-dk-300;
+$border-color: $grey-dk-200;
+
+$body-text-color: $grey-lt-300;
+$body-heading-color: $grey-lt-000;
+$nav-child-link-color: $grey-dk-000;
+$search-result-preview-color: $grey-dk-000;
+
+$link-color: #76b900;
+$btn-primary-color: #76b900;
+$base-button-color: $grey-dk-250;
+
+$code-background-color: $grey-dk-250;
+$search-background-color: $grey-dk-250;
+$table-background-color: $grey-dk-250;
+$feedback-color: darken($sidebar-color, 3%);
+
+div.highlighter-rouge,
+pre.highlight code,
+code.doxybook
+{ background-color: #111 !important; }
+
+span.doxybook-comment code
+{ background-color: #111 !important; border: none !important; }
+
+.highlight span.err { color: #ff0000; font-weight: bold; } /* Error */
+
+.highlight span.ow, /* Operator.Word */
+.highlight span.k,  /* Keyword */
+.highlight span.kc, /* Keyword.Constant */
+.highlight span.kd, /* Keyword.Declaration */
+.highlight span.kp, /* Keyword.Pseudo */
+.highlight span.kr, /* Keyword.Reserved */
+.highlight span.bp, /* Name.Builtin.Pseudo */
+.highlight span.vc, /* Name.Variable.Class */
+.highlight span.vg, /* Name.Variable.Global */
+.highlight span.vi  /* Name.Variable.Instance */
+{ color: #76b900; font-weight: bold; }
+
+.highlight span.n,  /* Name */
+.highlight span.h,  /* Name */
+.highlight span.na, /* Name.Attribute */
+.highlight span.nb, /* Name.Builtin */
+.highlight span.nc, /* Name.Class */
+.highlight span.no, /* Name.Constant */
+.highlight span.nd, /* Name.Decorator */
+.highlight span.ni, /* Name.Entity */
+.highlight span.ne, /* Name.Exception */
+.highlight span.nf, /* Name.Function */
+.highlight span.nl, /* Name.Label */
+.highlight span.nn, /* Name.Namespace */
+.highlight span.nx, /* Name.Other */
+.highlight span.py, /* Name.Property */
+.highlight span.nt, /* Name.Tag */
+.highlight span.nv, /* Name.Variable */
+.highlight span.kt  /* Keyword.Type */
+{ color: $grey-lt-300 }
+
+.highlight span.c,  /* Comment */
+.highlight span.cm, /* Comment.Multiline */
+.highlight span.c1, /* Comment.Single */
+.highlight span.cs, /* Comment.Special */
+span.doxybook-comment
+{ color: #009966; font-family: $body-font-family; font-style: italic; }
+
+.highlight span.cp  /* Preprocessor */
+.highlight span.kn, /* Keyword.Namespace */
+{ color: $grey-dk-000 }
+
+.highlight span.o, /* Operator */
+.highlight span.p  /* Punctuation */
+{ color: #00ff00; }
+
+.highlight span.ge { font-style: italic; } /* Generic.Emph */
+
+.highlight span.gs { font-weight: bold; } /* Generic.Strong */
+
+.highlight span.l,  /* Literal */
+.highlight span.ld, /* Literal.Date */
+.highlight span.m,  /* Literal.Number */
+.highlight span.mf, /* Literal.Number.Float */
+.highlight span.mh, /* Literal.Number.Hex */
+.highlight span.mi, /* Literal.Number.Integer */
+.highlight span.mo, /* Literal.Number.Oct */
+.highlight span.il, /* Literal.Number.Integer.Long */
+.highlight span.s,  /* Literal.String */
+.highlight span.sb, /* Literal.String.Backtick */
+.highlight span.sc, /* Literal.String.Char */
+.highlight span.sd, /* Literal.String.Doc */
+.highlight span.s2, /* Literal.String.Double */
+.highlight span.se, /* Literal.String.Escape */
+.highlight span.sh, /* Literal.String.Heredoc */
+.highlight span.si, /* Literal.String.Interpol */
+.highlight span.sx, /* Literal.String.Other */
+.highlight span.sr, /* Literal.String.Regex */
+.highlight span.s1, /* Literal.String.Single */
+.highlight span.ss  /* Literal.String.Symbol */
+{ color: #119911; }
+
+.highlight span.w { color: #00cc00; } /* Text.Whitespace */
+
+.highlight span.gh, /* Generic.Heading */
+.highlight span.gp, /* Generic.Prompt */
+.highlight span.gu  /* Generic.Subheading */
+{ color: #00ff00; font-weight: bold; }
+
+.highlight span.gd { color: #ff0000; } /* Generic.Deleted */
+.highlight span.gi { color: #00ff00; } /* Generic.Inserted */
+
+.search-input { color: $body-text-color; }
diff --git a/docs/github_pages/api.md b/docs/github_pages/api.md
new file mode 100644
index 000000000..6a2d1af43
--- /dev/null
+++ b/docs/github_pages/api.md
@@ -0,0 +1,8 @@
+---
+has_children: true
+has_toc: true
+nav_order: 2
+---
+
+# API
+
diff --git a/docs/github_pages/assets/images/nvidia_logo.png b/docs/github_pages/assets/images/nvidia_logo.png
new file mode 100644
index 000000000..6b005a283
Binary files /dev/null and b/docs/github_pages/assets/images/nvidia_logo.png differ
diff --git a/docs/github_pages/contributing.md b/docs/github_pages/contributing.md
new file mode 100644
index 000000000..6539768c4
--- /dev/null
+++ b/docs/github_pages/contributing.md
@@ -0,0 +1,10 @@
+---
+has_children: true
+has_toc: true
+nav_order: 4
+---
+
+# Contributing
+
+We welcome contributions - just send us a pull request!
+
diff --git a/docs/github_pages/contributing/release_process.md b/docs/github_pages/contributing/release_process.md
new file mode 100644
index 000000000..db21f60b4
--- /dev/null
+++ b/docs/github_pages/contributing/release_process.md
@@ -0,0 +1,85 @@
+---
+parent: Contributing
+nav_order: 1
+---
+
+# Release Process
+
+## Create a Changelog Entry
+
+Every release must have a changelog entry.
+The changelog entry should include:
+* A summary of the major accomplishments of the release.
+* A list of all the changes in the release.
+* A list of all the bugs fixed by the release.
+
+Contributions from new collaborators should be acknowledged in the changelog.
+
+## Create Git Annotated Tags and GitHub Releases
+
+Each release needs to have a Git annotated tag and a GitHub release for that tag.
+The changelog for the release should be used for the text of the GitHub release.
+
+## Update Compiler Explorer
+
+Thrust and CUB are bundled together on
+[Compiler Explorer](https://www.godbolt.org/) (CE) as libraries for the CUDA
+language. When releasing a new version of these projects, CE will need to be
+updated.
+
+There are two files in two repos that need to be updated:
+
+### libraries.yaml
+
+- Repo: https://github.com/compiler-explorer/infra
+- Path: bin/yaml/libraries.yaml
+
+This file tells CE how to pull in library files and defines which versions to
+fetch. Look for the `thrustcub:` section:
+
+```yaml
+    thrustcub:
+      type: github
+      method: clone_branch
+      repo: NVIDIA/thrust
+      check_file: dependencies/cub/cub/cub.cuh
+      targets:
+        - 1.9.9
+        - 1.9.10
+        - 1.9.10-1
+        - 1.10.0
+```
+
+Simply add the new version tag to list of `targets:`. This will check out the
+specified tag to `/opt/compiler-explorer/libs/thrustcub/<tag>/`.
+
+### cuda.amazon.properties
+
+- Repo: https://github.com/compiler-explorer/compiler-explorer
+- File: etc/config/cuda.amazon.properties
+
+This file defines the library versions displayed in the CE UI and maps them
+to a set of include directories. Look for the `libs.thrustcub` section:
+
+```yaml
+libs.thrustcub.name=Thrust+CUB
+libs.thrustcub.description=CUDA collective and parallel algorithms
+libs.thrustcub.versions=trunk:109090:109100:109101:110000
+libs.thrustcub.url=http://www.github.com/NVIDIA/thrust
+libs.thrustcub.versions.109090.version=1.9.9
+libs.thrustcub.versions.109090.path=/opt/compiler-explorer/libs/thrustcub/1.9.9:/opt/compiler-explorer/libs/thrustcub/1.9.9/dependencies/cub
+libs.thrustcub.versions.109100.version=1.9.10
+libs.thrustcub.versions.109100.path=/opt/compiler-explorer/libs/thrustcub/1.9.10:/opt/compiler-explorer/libs/thrustcub/1.9.10/dependencies/cub
+libs.thrustcub.versions.109101.version=1.9.10-1
+libs.thrustcub.versions.109101.path=/opt/compiler-explorer/libs/thrustcub/1.9.10-1:/opt/compiler-explorer/libs/thrustcub/1.9.10-1/dependencies/cub
+libs.thrustcub.versions.110000.version=1.10.0
+libs.thrustcub.versions.110000.path=/opt/compiler-explorer/libs/thrustcub/1.10.0:/opt/compiler-explorer/libs/thrustcub/1.10.0/dependencies/cub
+libs.thrustcub.versions.trunk.version=trunk
+libs.thrustcub.versions.trunk.path=/opt/compiler-explorer/libs/thrustcub/trunk:/opt/compiler-explorer/libs/thrustcub/trunk/dependencies/cub
+```
+
+Add a new version identifier to the `libs.thrustcub.versions` key, using the
+convention `X.Y.Z-W -> XXYYZZWW`. Then add a corresponding UI label (the
+`version` key) and set of colon-separated include paths for Thrust and CUB
+(`path`). The version used in the `path` entries must exactly match the tag
+specified in `libraries.yaml`.
diff --git a/docs/github_pages/contributing/submitting_a_pr.md b/docs/github_pages/contributing/submitting_a_pr.md
new file mode 100644
index 000000000..9c1757655
--- /dev/null
+++ b/docs/github_pages/contributing/submitting_a_pr.md
@@ -0,0 +1,295 @@
+---
+parent: Contributing
+nav_order: 0
+---
+
+# Submitting a PR
+
+Thrust uses Github to manage all open-source development, including bug
+tracking, pull requests, and design discussions. This document details how to get
+started as a Thrust contributor.
+
+An overview of this process is:
+
+1. [Clone the Thrust repository](#clone-the-thrust-repository)
+1. [Setup a fork of Thrust](#setup-a-fork-of-thrust)
+1. [Setup your environment](#setup-your-environment)
+1. [Create a development branch](#create-a-development-branch)
+1. [Local development loop](#local-development-loop)
+1. [Push development branch to your fork](#push-development-branch-to-your-fork)
+1. [Create pull request](#create-pull-request)
+1. [Address feedback and update pull request](#address-feedback-and-update-pull-request)
+1. [When your PR is approved...](#when-your-pr-is-approved)
+
+## Clone the Thrust Repository
+
+To get started, clone the main repository to your local computer. Thrust should
+be cloned recursively to setup the CUB submodule (required for `CUDA`
+acceleration).
+
+```
+git clone --recursive https://github.com/NVIDIA/thrust.git
+cd thrust
+```
+
+## Setup a Fork of Thrust
+
+You'll need a fork of Thrust on Github to create a pull request. To setup your
+fork:
+
+1. Create a Github account (if needed)
+2. Go to [the Thrust Github page](https://github.com/NVIDIA/thrust)
+3. Click "Fork" and follow any prompts that appear.
+
+Once your fork is created, setup a new remote repo in your local Thrust clone:
+
+```
+git remote add github-fork git@github.com:<GITHUB_USERNAME>/thrust.git
+```
+
+If you need to modify CUB, too, go to
+[the CUB Github page](https://github.com/NVIDIA/cub) and repeat this process.
+Create CUB's `github-fork` remote in the `thrust/dependencies/cub` submodule.
+
+## Setup Your Environment
+
+### Git Environment
+
+If you haven't already, this is a good time to tell git who you are. This
+information is used to fill out authorship information on your git commits.
+
+```
+git config --global user.name "John Doe"
+git config --global user.email johndoe@example.com
+```
+
+### Configure CMake builds
+
+Thrust uses [CMake](https://www.cmake.org) for its primary build system. To
+configure, build, and test your checkout of Thrust:
+
+```
+# Create build directory:
+mkdir build
+cd build
+
+# Configure -- use one of the following:
+cmake ..                                 # Command line interface
+cmake -DTHRUST_INCLUDE_CUB_CMAKE=ON ..   # Enables CUB development targets
+ccmake ..                # ncurses GUI (Linux only)
+cmake-gui                # Graphical UI, set source/build directories in the app
+
+# Build:
+cmake --build . -j <num jobs>   # invokes make (or ninja, etc)
+
+# Run tests and examples:
+ctest
+```
+
+See [CMake Options](../setup/cmake_options.md) for details on customizing the build. To
+enable CUB tests and examples, set the `THRUST_INCLUDE_CUB_CMAKE` option to
+`ON`. Additional CMake options for CUB are listed
+[here](https://github.com/NVIDIA/cub/blob/main/CONTRIBUTING.md#cmake-options).
+
+## Create a Development Branch
+
+All work should be done in a development branch (also called a "topic branch")
+and not directly in the `main` branch. This makes it easier to manage multiple
+in-progress patches at once, and provides a descriptive label for your patch
+as it passes through the review system.
+
+To create a new branch based on the current `main`:
+
+```
+# Checkout local main branch:
+cd /path/to/thrust/sources
+git checkout main
+
+# Sync local main branch with github:
+git pull
+
+# Create a new branch named `my_descriptive_branch_name` based on main:
+git checkout -b my_descriptive_branch_name
+
+# Verify that the branch has been created and is currently checked out:
+git branch
+```
+
+Thrust branch names should follow a particular pattern:
+
+- For new features, name the branch `feature/<name>`
+- For bugfixes associated with a github issue, use `bug/github/<bug-description>-<bug-id>`
+  - Internal nvidia and gitlab bugs should use `nvidia` or `gitlab` in place of
+    `github`.
+
+If you plan to work on CUB as part of your patch, repeat this process in the
+`thrust/dependencies/cub` submodule.
+
+## Local Development Loop
+
+### Edit, Build, Test, Repeat
+
+Once the topic branch is created, you're all set to start working on Thrust
+code. Make some changes, then build and test them:
+
+```
+# Implement changes:
+cd /path/to/thrust/sources
+emacs thrust/some_file.h # or whatever editor you prefer
+
+# Create / update a unit test for your changes:
+emacs testing/some_test.cu
+
+# Check that everything builds and tests pass:
+cd /path/to/thrust/build/directory
+cmake --build . -j <num jobs>
+ctest
+```
+
+### Creating a Commit
+
+Once you're satisfied with your patch, commit your changes:
+
+#### Thrust-only Changes
+
+```
+# Manually add changed files and create a commit:
+cd /path/to/thrust
+git add thrust/some_file.h
+git add testing/some_test.cu
+git commit
+
+# Or, if possible, use git-gui to review your changes while building your patch:
+git gui
+```
+
+#### Thrust and CUB Changes
+
+```
+# Create CUB patch first:
+cd /path/to/thrust/dependencies/cub
+# Manually add changed files and create a commit:
+git add cub/some_file.cuh
+git commit
+
+# Create Thrust patch, including submodule update:
+cd /path/to/thrust/
+git add dependencies/cub # Updates submodule info
+git add thrust/some_file.h
+git add testing/some_test.cu
+git commit
+
+# Or, if possible, use git-gui to review your changes while building your patch:
+cd /path/to/thrust/dependencies/cub
+git gui
+cd /path/to/thrust
+git gui # Include dependencies/cub as part of your commit
+
+```
+
+#### Writing a Commit Message
+
+Your commit message will communicate the purpose and rationale behind your
+patch to other developers, and will be used to populate the initial description
+of your Github pull request.
+
+When writing a commit message, the following standard format should be used,
+since tools in the git ecosystem are designed to parse this correctly:
+
+```
+First line of commit message is a short summary (<80 char)
+<Second line left blank>
+Detailed description of change begins on third line. This portion can
+span multiple lines, try to manually wrap them at something reasonable.
+
+Blank lines can be used to separate multiple paragraphs in the description.
+
+If your patch is associated with another pull request or issue in the main
+Thrust repository, you should reference it with a `#` symbol, e.g.
+#1023 for issue 1023.
+
+For issues / pull requests in a different github repo, reference them using
+the full syntax, e.g. NVIDIA/cub#4 for issue 4 in the NVIDIA/cub repo.
+
+Markdown is recommended for formatting more detailed messages, as these will
+be nicely rendered on Github, etc.
+```
+
+## Push Development Branch to your Fork
+
+Once you've committed your changes to a local development branch, it's time to
+push them to your fork:
+
+```
+cd /path/to/thrust/checkout
+git checkout my_descriptive_branch_name # if not already checked out
+git push --set-upstream github-fork my_descriptive_branch_name
+```
+
+`--set-upstream github-fork` tells git that future pushes/pulls on this branch
+should target your `github-fork` remote by default.
+
+If have CUB changes to commit as part of your patch, repeat this process in the
+`thrust/dependencies/cub` submodule.
+
+## Create Pull Request
+
+To create a pull request for your freshly pushed branch, open your github fork
+in a browser by going to `https://www.github.com/<GITHUB_USERNAME>/thrust`. A
+prompt may automatically appear asking you to create a pull request if you've
+recently pushed a branch.
+
+If there's no prompt, go to "Code" > "Branches" and click the appropriate
+"New pull request" button for your branch.
+
+If you would like a specific developer to review your patch, feel free to
+request them as a reviewer at this time.
+
+The Thrust team will review your patch, test it on NVIDIA's internal CI, and
+provide feedback.
+
+
+If have CUB changes to commit as part of your patch, repeat this process with
+your CUB branch and fork.
+
+## Address Feedback and Update Pull Request
+
+If the reviewers request changes to your patch, use the following process to
+update the pull request:
+
+```
+# Make changes:
+cd /path/to/thrust/sources
+git checkout my_descriptive_branch_name
+emacs thrust/some_file.h
+emacs testing/some_test.cu
+
+# Build + test
+cd /path/to/thrust/build/directory
+cmake --build . -j <num jobs>
+ctest
+
+# Amend commit:
+cd /path/to/thrust/sources
+git add thrust/some_file.h
+git add testing/some_test.cu
+git commit --amend
+# Or
+git gui # Check the "Amend Last Commit" box
+
+# Update the branch on your fork:
+git push -f
+```
+
+At this point, the pull request should show your recent changes.
+
+If have CUB changes to commit as part of your patch, repeat this process in the
+`thrust/dependencies/cub` submodule, and be sure to include any CUB submodule
+updates as part of your commit.
+
+## When Your PR is Approved
+
+Once your pull request is approved by the Thrust team, no further action is
+needed from you. We will handle integrating it since we must coordinate changes
+to `main` with NVIDIA's internal perforce repository.
+
diff --git a/docs/github_pages/favicon.ico b/docs/github_pages/favicon.ico
new file mode 100644
index 000000000..424df8720
Binary files /dev/null and b/docs/github_pages/favicon.ico differ
diff --git a/docs/github_pages/releases.md b/docs/github_pages/releases.md
new file mode 100644
index 000000000..81a5f2f3d
--- /dev/null
+++ b/docs/github_pages/releases.md
@@ -0,0 +1,60 @@
+---
+has_children: true
+has_toc: true
+nav_order: 3
+---
+
+# Releases
+
+| Version         | Included In                               |
+|-----------------|-------------------------------------------|
+| 2.0.1           | CUDA Toolkit 12.0                         |
+| 2.0.0           | TBD                                       |
+| 1.17.2          | TBD                                       |
+| 1.17.1          | TBD                                       |
+| 1.17.0          | TBD                                       |
+| 1.16.0          | TBD                                       |
+| 1.15.0          | NVIDIA HPC SDK 22.1 & CUDA Toolkit 11.6   |
+| 1.14.0          | NVIDIA HPC SDK 21.9                       |
+| 1.13.1          | CUDA Toolkit 11.5                         |
+| 1.13.1          | CUDA Toolkit 11.5                         |
+| 1.13.0          | NVIDIA HPC SDK 21.7                       |
+| 1.12.1          | CUDA Toolkit 11.4                         |
+| 1.12.0          | NVIDIA HPC SDK 21.3                       |
+| 1.11.0          | CUDA Toolkit 11.3                         |
+| 1.10.0          | NVIDIA HPC SDK 20.9 & CUDA Toolkit 11.2   |
+| 1.9.10-1        | NVIDIA HPC SDK 20.7 & CUDA Toolkit 11.1   |
+| 1.9.10          | NVIDIA HPC SDK 20.5                       |
+| 1.9.9           | CUDA Toolkit 11.0                         |
+| 1.9.8-1         | NVIDIA HPC SDK 20.3                       |
+| 1.9.8           | CUDA Toolkit 11.0 Early Access            |
+| 1.9.7-1         | CUDA Toolkit 10.2 for Tegra               |
+| 1.9.7           | CUDA Toolkit 10.2                         |
+| 1.9.6-1         | NVIDIA HPC SDK 20.3                       |
+| 1.9.6           | CUDA Toolkit 10.1 Update 2                |
+| 1.9.5           | CUDA Toolkit 10.1 Update 1                |
+| 1.9.4           | CUDA Toolkit 10.1                         |
+| 1.9.3           | CUDA Toolkit 10.0                         |
+| 1.9.2           | CUDA Toolkit 9.2                          |
+| 1.9.1-2         | CUDA Toolkit 9.1                          |
+| 1.9.0-5         | CUDA Toolkit 9.0                          |
+| 1.8.3           | CUDA Toolkit 8.0                          |
+| 1.8.2           | CUDA Toolkit 7.5                          |
+| 1.8.1           | CUDA Toolkit 7.0                          |
+| 1.8.0           |                                           |
+| 1.7.2           | CUDA Toolkit 6.5                          |
+| 1.7.1           | CUDA Toolkit 6.0                          |
+| 1.7.0           | CUDA Toolkit 5.5                          |
+| 1.6.0           |                                           |
+| 1.5.3           | CUDA Toolkit 5.0                          |
+| 1.5.2           | CUDA Toolkit 4.2                          |
+| 1.5.1           | CUDA Toolkit 4.1                          |
+| 1.5.0           |                                           |
+| 1.4.0           | CUDA Toolkit 4.0                          |
+| 1.3.0           |                                           |
+| 1.2.1           |                                           |
+| 1.2.0           |                                           |
+| 1.1.1           |                                           |
+| 1.1.0           |                                           |
+| 1.0.0           |                                           |
+
diff --git a/docs/github_pages/releases/versioning.md b/docs/github_pages/releases/versioning.md
new file mode 100644
index 000000000..e5f0e8eb1
--- /dev/null
+++ b/docs/github_pages/releases/versioning.md
@@ -0,0 +1,71 @@
+---
+parent: Releases
+nav_order: 1
+---
+
+# Versioning
+
+Thrust has its own versioning system for releases, independent of the
+  versioning scheme of the NVIDIA HPC SDK or the CUDA Toolkit.
+
+Today, Thrust version numbers have a specific [semantic meaning](https://semver.org/).
+Releases prior to 1.10.0 largely, but not strictly, followed these semantic
+  meanings.
+
+The version number for a Thrust release uses the following format:
+  `MMM.mmm.ss-ppp`, where:
+
+* `THRUST_VERSION_MAJOR`/`MMM`: Major version, up to 3 decimal digits.
+  It is incremented when changes that are API-backwards-incompatible are made.
+* `THRUST_VERSION_MINOR`/`mmm`: Minor version, up to 3 decimal digits.
+  It is incremented when breaking API, ABI, or semantic changes are made.
+* `THRUST_VERSION_SUBMINOR`/`ss`: Subminor version, up to 2 decimal digits.
+  It is incremented when notable new features or bug fixes or features that are
+  API-backwards-compatible are made.
+* `THRUST_PATCH_NUMBER`/`ppp`: Patch number, up to 3 decimal digits.
+  This is no longer used and will be zero for all future releases.
+
+The `<thrust/version.h>` header defines `THRUST_*` macros for all of the
+  version components mentioned above.
+Additionally, a `THRUST_VERSION` macro is defined, which is an integer literal
+  containing all of the version components except for `THRUST_PATCH_NUMBER`.
+
+## Trunk Based Development
+
+Thrust uses [trunk based development](https://trunkbaseddevelopment.com).
+There is a single long-lived branch called `main`, which is public and the
+  "source of truth".
+All other branches are downstream from `main`.
+Engineers may create branches for feature development.
+Such branches always merge into `main`.
+There are no release branches.
+Releases are produced by taking a snapshot of `main` ("snapping").
+After a release has been snapped from `main`, it will never be changed.
+
+## Branches and Tags
+
+The following tag names are used in the Thrust project:
+
+* `nvhpc-X.Y`: the tag that directly corresponds to what has been
+  shipped in the NVIDIA HPC SDK release X.Y.
+* `cuda-X.Y`: the tag that directly corresponds to what has been shipped
+  in the CUDA Toolkit release X.Y.
+* `A.B.C`: the tag that directly corresponds to Thrust version A.B.C.
+* `A.B.C-rcN`: the tag that directly corresponds to Thrust version A.B.C
+  release candidate N.
+
+The following branch names are used in the Thrust project:
+
+* `main`: the "source of truth" development branch of Thrust.
+* `old-master`: the old "source of truth" branch, before unification of
+  public and internal repositories.
+* `feature/<name>`: feature branch for a feature under development.
+* `bug/<bug-system>/<bug-description>-<bug-id>`: bug fix branch, where
+  `bug-system` is `github` or `nvidia`.
+
+On the rare occasion that we cannot do work in the open, for example when
+  developing a change specific to an unreleased product, these branches may
+  exist on an internal NVIDIA GitLab instance instead of the public GitHub.
+By default, everything should be in the open on GitHub unless there is a strong
+  motivation for it to not be open.
+
diff --git a/docs/github_pages/setup.md b/docs/github_pages/setup.md
new file mode 100644
index 000000000..edbef2e5c
--- /dev/null
+++ b/docs/github_pages/setup.md
@@ -0,0 +1,7 @@
+---
+has_children: true
+has_toc: true
+nav_order: 1
+---
+
+# Setup
diff --git a/docs/github_pages/setup/cmake_options.md b/docs/github_pages/setup/cmake_options.md
new file mode 100644
index 000000000..b62faddeb
--- /dev/null
+++ b/docs/github_pages/setup/cmake_options.md
@@ -0,0 +1,139 @@
+---
+parent: Setup
+nav_order: 1
+---
+
+# CMake Options
+
+A Thrust build is configured using CMake options. These may be passed to CMake
+using
+
+```
+cmake -D<option_name>=<value> /path/to/thrust/sources
+```
+
+or configured interactively with the `ccmake` or `cmake-gui` interfaces.
+
+Thrust supports two build modes. By default, a single configuration is built
+that targets a specific host system, device system, and C++ dialect.
+When `THRUST_ENABLE_MULTICONFIG` is `ON`, multiple configurations
+targeting a variety of systems and dialects are generated.
+
+The CMake options are divided into these categories:
+
+1. [Generic CMake Options](#generic-cmake-options): Options applicable to all
+   Thrust builds.
+1. [Single Config CMake Options](#single-config-cmake-options) Options
+   applicable only when `THRUST_ENABLE_MULTICONFIG` is disabled.
+1. [Multi Config CMake Options](#multi-config-cmake-options) Options applicable
+   only when `THRUST_ENABLE_MULTICONFIG` is enabled.
+1. [CUDA Specific CMake Options](#cuda-specific-cmake-options) Options that
+   control CUDA compilation. Only available when one or more configurations
+   targets the CUDA system.
+1. [TBB Specific CMake Options](#tbb-specific-cmake-options) Options that
+   control TBB compilation. Only available when one or more configurations
+   targets the TBB system.
+
+## Generic CMake Options
+
+- `CMAKE_BUILD_TYPE={Release, Debug, RelWithDebInfo, MinSizeRel}`
+  - Standard CMake build option. Default: `RelWithDebInfo`
+- `THRUST_ENABLE_HEADER_TESTING={ON, OFF}`
+  - Whether to test compile public headers. Default is `ON`.
+- `THRUST_ENABLE_TESTING={ON, OFF}`
+  - Whether to build unit tests. Default is `ON`.
+- `THRUST_ENABLE_EXAMPLES={ON, OFF}`
+  - Whether to build examples. Default is `ON`.
+- `THRUST_ENABLE_MULTICONFIG={ON, OFF}`
+  - Toggles single-config and multi-config modes. Default is `OFF` (single config).
+- `THRUST_ENABLE_EXAMPLE_FILECHECK={ON, OFF}`
+  - Enable validation of example outputs using the LLVM FileCheck utility.
+    Default is `OFF`.
+- `THRUST_ENABLE_INSTALL_RULES={ON, OFF}`
+  - If true, installation rules will be generated for thrust. Default is `ON`.
+
+## Single Config CMake Options
+
+- `THRUST_HOST_SYSTEM={CPP, TBB, OMP}`
+  - Selects the host system. Default: `CPP`
+- `THRUST_DEVICE_SYSTEM={CUDA, TBB, OMP, CPP}`
+  - Selects the device system. Default: `CUDA`
+- `THRUST_CPP_DIALECT={11, 14, 17}`
+  - Selects the C++ standard dialect to use. Default is `14` (C++14).
+
+## Multi Config CMake Options
+
+- `THRUST_MULTICONFIG_ENABLE_DIALECT_CPPXX={ON, OFF}`
+  - Toggle whether a specific C++ dialect will be targeted.
+  - Possible values of `XX` are `{11, 14, 17}`.
+  - By default, only C++14 is enabled.
+- `THRUST_MULTICONFIG_ENABLE_SYSTEM_XXXX={ON, OFF}`
+  - Toggle whether a specific system will be targeted.
+  - Possible values of `XXXX` are `{CPP, CUDA, TBB, OMP}`
+  - By default, only `CPP` and `CUDA` are enabled.
+- `THRUST_MULTICONFIG_WORKLOAD={SMALL, MEDIUM, LARGE, FULL}`
+  - Restricts the host/device combinations that will be targeted.
+  - By default, the `SMALL` workload is used.
+  - The full cross product of `host x device` systems results in 12
+    configurations, some of which are more important than others.
+    This option can be used to prune some of the less important ones.
+  - `SMALL`: (3 configs) Minimal coverage and validation of each device system against the `CPP` host.
+  - `MEDIUM`: (6 configs) Cheap extended coverage.
+  - `LARGE`: (8 configs) Expensive extended coverage. Includes all useful build configurations.
+  - `FULL`: (12 configs) The complete cross product of all possible build configurations.
+
+| Config   | Workloads | Value      | Expense   | Note                         |
+|----------|-----------|------------|-----------|------------------------------|
+| CPP/CUDA | `F L M S` | Essential  | Expensive | Validates CUDA against CPP   |
+| CPP/OMP  | `F L M S` | Essential  | Cheap     | Validates OMP against CPP    |
+| CPP/TBB  | `F L M S` | Essential  | Cheap     | Validates TBB against CPP    |
+| CPP/CPP  | `F L M  ` | Important  | Cheap     | Tests CPP as device          |
+| OMP/OMP  | `F L M  ` | Important  | Cheap     | Tests OMP as host            |
+| TBB/TBB  | `F L M  ` | Important  | Cheap     | Tests TBB as host            |
+| TBB/CUDA | `F L    ` | Important  | Expensive | Validates TBB/CUDA interop   |
+| OMP/CUDA | `F L    ` | Important  | Expensive | Validates OMP/CUDA interop   |
+| TBB/OMP  | `F      ` | Not useful | Cheap     | Mixes CPU-parallel systems   |
+| OMP/TBB  | `F      ` | Not useful | Cheap     | Mixes CPU-parallel systems   |
+| TBB/CPP  | `F      ` | Not Useful | Cheap     | Parallel host, serial device |
+| OMP/CPP  | `F      ` | Not Useful | Cheap     | Parallel host, serial device |
+
+## CUDA Specific CMake Options
+
+- `THRUST_INCLUDE_CUB_CMAKE={ON, OFF}`
+  - If enabled, the CUB project will be built as part of Thrust. Default is
+    `OFF`.
+  - This adds CUB tests, etc. Useful for working on both CUB and Thrust
+    simultaneously.
+  - CUB configurations will be generated for each C++ dialect targeted by
+    the current Thrust build.
+- `THRUST_INSTALL_CUB_HEADERS={ON, OFF}`
+  - If enabled, the CUB project's headers will be installed through Thrust's
+    installation rules. Default is `ON`.
+  - This option depends on `THRUST_ENABLE_INSTALL_RULES`.
+- `THRUST_ENABLE_COMPUTE_XX={ON, OFF}`
+  - Controls the targeted CUDA architecture(s)
+  - Multiple options may be selected when using NVCC as the CUDA compiler.
+  - Valid values of `XX` are:
+    `{35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80}`
+  - Default value depends on `THRUST_DISABLE_ARCH_BY_DEFAULT`:
+- `THRUST_ENABLE_COMPUTE_FUTURE={ON, OFF}`
+  - If enabled, CUDA objects will target the most recent virtual architecture
+    in addition to the real architectures specified by the
+    `THRUST_ENABLE_COMPUTE_XX` options.
+  - Default value depends on `THRUST_DISABLE_ARCH_BY_DEFAULT`:
+- `THRUST_DISABLE_ARCH_BY_DEFAULT={ON, OFF}`
+  - When `ON`, all `THRUST_ENABLE_COMPUTE_*` options are initially `OFF`.
+  - Default: `OFF` (meaning all architectures are enabled by default)
+- `THRUST_ENABLE_TESTS_WITH_RDC={ON, OFF}`
+  - Whether to enable Relocatable Device Code when building tests.
+    Default is `OFF`.
+- `THRUST_ENABLE_EXAMPLES_WITH_RDC={ON, OFF}`
+  - Whether to enable Relocatable Device Code when building examples.
+    Default is `OFF`.
+
+## TBB Specific CMake Options
+
+- `THRUST_TBB_ROOT=<path to tbb root>`
+  - When the TBB system is requested, set this to the root of the TBB installation
+    (e.g. the location of `lib/`, `bin/` and `include/` for the TBB libraries).
+
diff --git a/docs/github_pages/setup/requirements.md b/docs/github_pages/setup/requirements.md
new file mode 100644
index 000000000..9d5316456
--- /dev/null
+++ b/docs/github_pages/setup/requirements.md
@@ -0,0 +1,82 @@
+---
+parent: Setup
+nav_order: 0
+---
+
+# Requirements
+
+All requirements are applicable to the `main` branch on GitHub.
+For details on specific releases, please see the [CHANGELOG.md].
+
+## Usage Requirements
+
+To use the NVIDIA C++ Standard Library, you must meet the following
+  requirements.
+
+### System Software
+
+Thrust and CUB require either the [NVIDIA HPC SDK] or the [CUDA Toolkit].
+
+Releases of Thrust and CUB are only tested against the latest releases of NVHPC
+  and CUDA.
+It may be possible to use newer version of Thrust and CUB with an older NVHPC or
+  CUDA installation by using a Thrust and CUB release from GitHub, but please
+  be aware this is not officially supported.
+
+### C++ Dialects
+
+Thrust and CUB support the following C++ dialects:
+
+- C++11 (deprecated)
+- C++14
+- C++17
+
+### Compilers
+
+Thrust and CUB support the following compilers when used in conjunction with
+  NVCC:
+
+- NVCC (latest version)
+- NVC++ (latest version)
+- GCC 5+
+- Clang 7+
+- MSVC 2019+ (19.20/16.0/14.20)
+
+Unsupported versions may emit deprecation warnings, which can be
+  silenced by defining `THRUST_IGNORE_DEPRECATED_COMPILER` during compilation.
+
+### Device Architectures
+
+Thrust and CUB support all NVIDIA device architectures since SM 35.
+
+### Host Architectures
+
+Thrust and CUB support the following host architectures:
+
+- aarch64.
+- x86-64.
+- ppc64le.
+
+### Host Operating Systems
+
+Thrust and CUB support the following host operating systems:
+
+- Linux.
+- Windows.
+
+## Build and Test Requirements
+
+To build and test Thrust and CUB yourself, you will need the following in
+  addition to the above requirements:
+
+- [CMake].
+
+
+
+[changelog]: ./releases/changelog.md
+
+[NVIDIA HPC SDK]: https://developer.nvidia.com/hpc-sdk
+[CUDA Toolkit]: https://developer.nvidia.com/cuda-toolkit
+
+[CMake]: https://cmake.org
+
diff --git a/docs/serve_docs_locally.bash b/docs/serve_docs_locally.bash
new file mode 100755
index 000000000..f438795e4
--- /dev/null
+++ b/docs/serve_docs_locally.bash
@@ -0,0 +1,35 @@
+#! /usr/bin/env bash
+
+###############################################################################
+# Copyright (c) 2018-2021 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+###############################################################################
+
+SCRIPT_PATH=$(cd $(dirname ${0}); pwd -P)
+
+REPO_PATH=${SCRIPT_PATH}/..
+
+BUILD_DOCS_PATH=build_docs
+BUILD_GITHUB_PAGES_PATH=${BUILD_DOCS_PATH}/github_pages
+
+cd ${REPO_PATH}/${BUILD_GITHUB_PAGES_PATH}
+
+bundle install
+bundle exec jekyll serve \
+  --verbose              \
+  --incremental          \
+  --profile              \
+  --baseurl "/thrust"    \
+  ${@}
+
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
new file mode 100644
index 000000000..306ecb7a3
--- /dev/null
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,157 @@
+# Setup FileCheck if requested and available:
+option(THRUST_ENABLE_EXAMPLE_FILECHECK
+  "Check example output with the LLVM FileCheck utility."
+  OFF
+)
+set(filecheck_data_path "${Thrust_SOURCE_DIR}/internal/test")
+
+if (THRUST_ENABLE_EXAMPLE_FILECHECK)
+  # TODO this should go into a find module
+  find_program(THRUST_FILECHECK_EXECUTABLE
+    DOC "Path to the LLVM FileCheck utility."
+    NAMES
+      FileCheck
+      FileCheck-3.9
+      FileCheck-4.0
+      FileCheck-5.0
+      FileCheck-6.0
+      FileCheck-7
+      FileCheck-8
+      FileCheck-9
+  )
+
+  if (NOT THRUST_FILECHECK_EXECUTABLE)
+    message(FATAL_ERROR
+      "Could not find the LLVM FileCheck utility. Set THRUST_FILECHECK_EXECUTABLE manually, "
+      "or disable THRUST_ENABLE_EXAMPLE_FILECHECK."
+    )
+  endif()
+
+  execute_process(
+    COMMAND "${THRUST_FILECHECK_EXECUTABLE}" "${filecheck_data_path}/thrust.smoke.filecheck"
+    INPUT_FILE "${Thrust_SOURCE_DIR}/cmake/filecheck_smoke_test"
+    RESULT_VARIABLE exit_code
+  )
+
+  if (0 EQUAL exit_code)
+    message(STATUS "FileCheck enabled: ${THRUST_FILECHECK_EXECUTABLE}")
+  else()
+    message(FATAL_ERROR
+      "The current THRUST_FILECHECK_EXECUTABLE ('${THRUST_FILECHECK_EXECUTABLE}') "
+      "does not seem to be a valid FileCheck executable."
+    )
+  endif()
+endif()
+
+# Create meta targets that build all examples for a single configuration:
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+  set(config_meta_target ${config_prefix}.examples)
+  add_custom_target(${config_meta_target})
+  add_dependencies(${config_prefix}.all ${config_meta_target})
+endforeach()
+
+# Update flags to reflect RDC options. See note in ThrustCudaConfig.cmake --
+# these flag variables behave unintuitively:
+if (THRUST_ENABLE_EXAMPLES_WITH_RDC)
+  set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_RDC}")
+else()
+  set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_NO_RDC}")
+endif()
+
+## thrust_add_example
+#
+# Add an example executable and register it with ctest.
+#
+# target_name_var: Variable name to overwrite with the name of the example
+#   target. Useful for post-processing target information per-backend.
+# example_name: The name of the example minus "<config_prefix>.example." For
+#   instance, examples/vector.cu will be "vector", and examples/cuda/copy.cu
+#   would be "cuda.copy".
+# example_src: The source file that implements the example.
+# thrust_target: The reference thrust target with configuration information.
+#
+function(thrust_add_example target_name_var example_name example_src thrust_target)
+  thrust_get_target_property(config_host ${thrust_target} HOST)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+
+  # Wrap the .cu file in .cpp for non-CUDA backends
+  if ("CUDA" STREQUAL "${config_device}")
+    set(real_example_src "${example_src}")
+  else()
+    thrust_wrap_cu_in_cpp(real_example_src "${example_src}" ${thrust_target})
+  endif()
+
+  # The actual name of the test's target:
+  set(example_target ${config_prefix}.example.${example_name})
+  set(${target_name_var} ${example_target} PARENT_SCOPE)
+
+  # Related target names:
+  set(config_meta_target ${config_prefix}.examples)
+  set(example_meta_target thrust.all.example.${example_name})
+
+  add_executable(${example_target} "${real_example_src}")
+  target_link_libraries(${example_target} ${thrust_target})
+  target_include_directories(${example_target} PRIVATE "${Thrust_SOURCE_DIR}/examples")
+  thrust_clone_target_properties(${example_target} ${thrust_target})
+  thrust_fix_clang_nvcc_build_for(${example_target})
+
+  # Add to the active configuration's meta target
+  add_dependencies(${config_meta_target} ${example_target})
+
+  # Meta target that builds examples with this name for all configurations:
+  if (NOT TARGET ${example_meta_target})
+    add_custom_target(${example_meta_target})
+  endif()
+  add_dependencies(${example_meta_target} ${example_target})
+
+  if ("CUDA" STREQUAL "${config_device}" AND
+      THRUST_ENABLE_EXAMPLES_WITH_RDC)
+    thrust_enable_rdc_for_cuda_target(${example_target})
+  endif()
+
+  if (NOT "Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+    target_compile_definitions(${example_target} PRIVATE THRUST_EXAMPLE_DEVICE_SIDE)
+  endif()
+
+  # Get the name of FileCheck input by stripping out the config name.
+  # (e.g. "thrust.cpp.cuda.cpp14.example.xxx" -> "thrust.example.xxx.filecheck")
+  string(REPLACE "${config_prefix}" "thrust"
+    filecheck_reference_file
+    "${example_target}.filecheck"
+  )
+
+  add_test(NAME ${example_target}
+    COMMAND "${CMAKE_COMMAND}"
+    "-DEXAMPLE_EXECUTABLE=$<TARGET_FILE:${example_target}>"
+    "-DFILECHECK_ENABLED=${THRUST_ENABLE_EXAMPLE_FILECHECK}"
+    "-DFILECHECK_EXECUTABLE=${THRUST_FILECHECK_EXECUTABLE}"
+    "-DREFERENCE_FILE=${filecheck_data_path}/${filecheck_reference_file}"
+    -P "${Thrust_SOURCE_DIR}/cmake/ThrustRunExample.cmake"
+  )
+
+  # Run OMP/TBB tests in serial. Multiple OMP processes will massively
+  # oversubscribe the machine with GCC's OMP, and we want to test these with
+  # the full CPU available to each unit test.
+  set(config_systems ${config_host} ${config_device})
+  if (("OMP" IN_LIST config_systems) OR ("TBB" IN_LIST config_systems))
+    set_tests_properties(${example_target} PROPERTIES RUN_SERIAL ON)
+  endif()
+endfunction()
+
+file(GLOB example_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
+  CONFIGURE_DEPENDS
+  *.cu *.cpp
+)
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  foreach(example_src IN LISTS example_srcs)
+    get_filename_component(example_name "${example_src}" NAME_WLE)
+    thrust_add_example(example_target ${example_name} "${example_src}" ${thrust_target})
+  endforeach()
+endforeach()
+
+add_subdirectory(cmake)
+add_subdirectory(cuda)
diff --git a/examples/README b/examples/README.md
similarity index 56%
rename from examples/README
rename to examples/README.md
index 4188534fe..8a43897bb 100644
--- a/examples/README
+++ b/examples/README.md
@@ -4,8 +4,4 @@ norm example.
   $ nvcc norm.cu -o norm
 
 These examples are also available online:
-  https://github.com/thrust/thrust/tree/master/examples
-
-For additional information refer to the Quick Start Guide:
-  https://github.com/thrust/thrust/wiki/Quick-Start-Guide
-
+  https://github.com/NVIDIA/thrust/tree/main/examples
diff --git a/examples/arbitrary_transformation.cu b/examples/arbitrary_transformation.cu
index d1a15096f..be22c2e5a 100644
--- a/examples/arbitrary_transformation.cu
+++ b/examples/arbitrary_transformation.cu
@@ -3,6 +3,12 @@
 #include <thrust/iterator/zip_iterator.h>
 #include <iostream>
 
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#include <thrust/zip_function.h>
+#endif // >= C++11
+
 // This example shows how to implement an arbitrary transformation of
 // the form output[i] = F(first[i], second[i], third[i], ... ).
 // In this example, we use a function with 3 inputs and 1 output.
@@ -22,6 +28,10 @@
 //      D[i] = A[i] + B[i] * C[i];
 // by invoking arbitrary_functor() on each of the tuples using for_each.
 //
+// If we are using a functor that is not designed for zip iterators by taking a
+// tuple instead of individual arguments we can adapt this function using the
+// zip_function adaptor (C++11 only).
+//
 // Note that we could extend this example to implement functions with an
 // arbitrary number of input arguments by zipping more sequence together.
 // With the same approach we can have multiple *output* sequences, if we 
@@ -31,7 +41,7 @@
 //
 // The possibilities are endless! :)
 
-struct arbitrary_functor
+struct arbitrary_functor1
 {
     template <typename Tuple>
     __host__ __device__
@@ -42,6 +52,17 @@ struct arbitrary_functor
     }
 };
 
+#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+struct arbitrary_functor2
+{
+    __host__ __device__
+    void operator()(const float& a, const float& b, const float& c, float& d)
+    {
+        // D[i] = A[i] + B[i] * C[i];
+        d = a + b * c;
+    }
+};
+#endif // >= C++11
 
 int main(void)
 {
@@ -49,7 +70,7 @@ int main(void)
     thrust::device_vector<float> A(5);
     thrust::device_vector<float> B(5);
     thrust::device_vector<float> C(5);
-    thrust::device_vector<float> D(5);
+    thrust::device_vector<float> D1(5);
 
     // initialize input vectors
     A[0] = 3;  B[0] = 6;  C[0] = 2; 
@@ -59,12 +80,26 @@ int main(void)
     A[4] = 2;  B[4] = 8;  C[4] = 3; 
 
     // apply the transformation
-    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin(), D.begin())),
-                     thrust::make_zip_iterator(thrust::make_tuple(A.end(),   B.end(),   C.end(),   D.end())),
-                     arbitrary_functor());
+    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin(), D1.begin())),
+                     thrust::make_zip_iterator(thrust::make_tuple(A.end(),   B.end(),   C.end(),   D1.end())),
+                     arbitrary_functor1());
+
+    // print the output
+    std::cout << "Tuple functor" << std::endl;
+    for(int i = 0; i < 5; i++)
+        std::cout << A[i] << " + " << B[i] << " * " << C[i] << " = " << D1[i] << std::endl;
+
+    // apply the transformation using zip_function
+#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+    thrust::device_vector<float> D2(5);
+    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin(), D2.begin())),
+                     thrust::make_zip_iterator(thrust::make_tuple(A.end(),   B.end(),   C.end(),   D2.end())),
+                     thrust::make_zip_function(arbitrary_functor2()));
 
     // print the output
+    std::cout << "N-ary functor" << std::endl;
     for(int i = 0; i < 5; i++)
-        std::cout << A[i] << " + " << B[i] << " * " << C[i] << " = " << D[i] << std::endl;
+        std::cout << A[i] << " + " << B[i] << " * " << C[i] << " = " << D2[i] << std::endl;
+#endif // >= C++11
 }
 
diff --git a/examples/cmake/CMakeLists.txt b/examples/cmake/CMakeLists.txt
new file mode 100644
index 000000000..25d2a2f95
--- /dev/null
+++ b/examples/cmake/CMakeLists.txt
@@ -0,0 +1,28 @@
+thrust_update_system_found_flags()
+
+set(extra_cmake_flags)
+
+# Need to pass these when testing NVC++.
+if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  set(extra_cmake_flags
+    -D "CMAKE_CUDA_COMPILER_ID=${CMAKE_CUDA_COMPILER_ID}"
+    -D "CMAKE_CUDA_COMPILER_FORCED=${CMAKE_CUDA_COMPILER_FORCED}"
+  )
+endif()
+
+if (THRUST_CPP_FOUND AND THRUST_CUDA_FOUND)
+  # Do a basic check of the cmake/ThrustAddSubdir.cmake mechanism:
+  add_test(
+    NAME thrust.example.cmake.add_subdir
+    COMMAND "${CMAKE_COMMAND}"
+      --log-level=VERBOSE
+      -G "${CMAKE_GENERATOR}"
+      -S "${CMAKE_CURRENT_SOURCE_DIR}/add_subdir"
+      -B "${CMAKE_CURRENT_BINARY_DIR}/add_subdir"
+      -D "THRUST_ROOT=${Thrust_SOURCE_DIR}"
+      -D "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
+      -D "CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}"
+      -D "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
+      ${extra_cmake_flags}
+  )
+endif()
diff --git a/examples/cmake/add_subdir/CMakeLists.txt b/examples/cmake/add_subdir/CMakeLists.txt
new file mode 100644
index 000000000..96283699f
--- /dev/null
+++ b/examples/cmake/add_subdir/CMakeLists.txt
@@ -0,0 +1,91 @@
+# This example demonstrates / tests adding thrust via a CMake add_subdirectory
+# call from a parent project.
+#
+# The variables THRUST_REQUIRED_SYSTEMS and THRUST_OPTIONAL_SYSTEMS must be
+# set prior to add_subdirectory(thrust), and afterwards the thrust_create_target
+# function may be used to create targets with the desired systems. See
+# NVIDIA/thrust/cmake/README.md for more details on thrust_create_target.
+
+cmake_minimum_required(VERSION 3.15)
+
+# Silence warnings about empty CUDA_ARCHITECTURES properties on example targets:
+if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
+  cmake_policy(SET CMP0104 OLD)
+endif()
+
+project(ThrustAddSubDirExample CXX)
+
+# Add required Thrust systems to THRUST_REQUIRED_SYSTEMS.
+# Options are: CPP, CUDA, TBB or OMP.
+# Multiple systems may be specified.
+# An error is emitted if the system is not found.
+set(THRUST_REQUIRED_SYSTEMS CPP)
+
+# Add optional Thrust systems to THRUST_OPTIONAL_SYSTEMS.
+# Options are: CPP, CUDA, TBB or OMP.
+# Multiple systems may be specified.
+# No error is emitted if not found.
+set(THRUST_OPTIONAL_SYSTEMS CUDA)
+
+# Use your project's checkout of Thrust here, for most cases
+# `add_subdirectory(thrust)` will be sufficient.
+add_subdirectory("${THRUST_ROOT}" thrust)
+
+# Create a thrust target that only uses the serial CPP backend.
+# See thrust/thrust/cmake/README.md for details and additional options:
+thrust_create_target(ThrustCPP HOST CPP DEVICE CPP)
+
+# Create an executable that uses the CPP-only thrust target:
+add_executable(ExecWithCPP dummy.cpp)
+target_link_libraries(ExecWithCPP ThrustCPP)
+
+# To test for optional systems, first call thrust_update_system_found_flags to
+# set the THRUST_${system}_FOUND flags in current scope.
+# Required due to CMake scoping rules.
+thrust_update_system_found_flags()
+
+# Create and use a Thrust target configured to use CUDA acceleration if CUDA
+# is available:
+if (THRUST_CUDA_FOUND)
+  enable_language(CUDA)
+  thrust_create_target(ThrustCUDA HOST CPP DEVICE CUDA)
+  add_executable(ExecWithCUDA dummy.cu)
+  target_link_libraries(ExecWithCUDA ThrustCUDA)
+endif()
+
+#
+# Validation
+#
+
+function(assert_boolean var_name expect)
+  if (expect)
+    if (NOT ${var_name})
+      message(FATAL_ERROR "'${var_name}' is false, expected true.")
+    endif()
+  else()
+    if (${var_name})
+      message(FATAL_ERROR "'${var_name}' is true, expected false.")
+    endif()
+  endif()
+endfunction()
+
+function(assert_target target_name)
+  if (NOT TARGET "${target_name}")
+    message(FATAL_ERROR "Target '${target_name}' not defined.")
+  endif()
+endfunction()
+
+assert_boolean(THRUST_CPP_FOUND TRUE)
+assert_boolean(THRUST_CUDA_FOUND TRUE)
+assert_boolean(THRUST_OMP_FOUND FALSE)
+assert_boolean(THRUST_TBB_FOUND FALSE)
+
+assert_target(ThrustCPP)
+assert_target(ThrustCUDA)
+assert_target(ExecWithCPP)
+assert_target(ExecWithCUDA)
+
+thrust_debug_target(ThrustCPP "")
+thrust_debug_target(ThrustCUDA "")
+thrust_debug_target(ExecWithCPP "")
+thrust_debug_target(ExecWithCUDA "")
diff --git a/examples/cmake/add_subdir/dummy.cpp b/examples/cmake/add_subdir/dummy.cpp
new file mode 100644
index 000000000..ad7b9435f
--- /dev/null
+++ b/examples/cmake/add_subdir/dummy.cpp
@@ -0,0 +1,32 @@
+#include <thrust/detail/config.h>
+
+#include <iostream>
+
+int main()
+{
+  std::cout << "Hello from Thrust version " << THRUST_VERSION << ":\n"
+
+            << "Host system: "
+#if THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_CPP
+            << "CPP\n"
+#elif THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_OMP
+            << "OMP\n"
+#elif THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_TBB
+            << "TBB\n"
+#else
+            << "Unknown\n"
+#endif
+
+            << "Device system: "
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP
+            << "CPP\n";
+#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+            << "CUDA\n";
+#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_OMP
+            << "OMP\n";
+#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB
+            << "TBB\n";
+#else
+            << "Unknown\n";
+#endif
+}
diff --git a/examples/cmake/add_subdir/dummy.cu b/examples/cmake/add_subdir/dummy.cu
new file mode 100644
index 000000000..b5645fc3d
--- /dev/null
+++ b/examples/cmake/add_subdir/dummy.cu
@@ -0,0 +1 @@
+#include "dummy.cpp"
diff --git a/examples/cuda/CMakeLists.txt b/examples/cuda/CMakeLists.txt
new file mode 100644
index 000000000..bd72c58c0
--- /dev/null
+++ b/examples/cuda/CMakeLists.txt
@@ -0,0 +1,18 @@
+file(GLOB example_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}}"
+  CONFIGURE_DEPENDS
+  *.cu *.cpp
+)
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  if (NOT config_device STREQUAL "CUDA")
+    continue()
+  endif()
+
+  foreach(example_src IN LISTS example_srcs)
+    get_filename_component(example_name "${example_src}" NAME_WLE)
+    string(PREPEND example_name "cuda.")
+    thrust_add_example(example_target ${example_name} "${example_src}" ${thrust_target})
+  endforeach()
+endforeach()
diff --git a/examples/cuda/async_reduce.cu b/examples/cuda/async_reduce.cu
index ca21c88cb..6e1584bcc 100644
--- a/examples/cuda/async_reduce.cu
+++ b/examples/cuda/async_reduce.cu
@@ -1,9 +1,10 @@
+#include <thrust/detail/config.h>
 #include <thrust/device_vector.h>
 #include <thrust/reduce.h>
 #include <thrust/system/cuda/execution_policy.h>
 #include <cassert>
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
 #include <future>
 #endif
 
@@ -20,11 +21,13 @@
 // std::future to wait for the result of the reduction. This method requires a compiler which supports
 // C++11-capable language and library constructs.
 
+#ifdef THRUST_EXAMPLE_DEVICE_SIDE
 template<typename Iterator, typename T, typename BinaryOperation, typename Pointer>
 __global__ void reduce_kernel(Iterator first, Iterator last, T init, BinaryOperation binary_op, Pointer result)
 {
   *result = thrust::reduce(thrust::cuda::par, first, last, init, binary_op);
 }
+#endif
 
 int main()
 {
@@ -39,7 +42,11 @@ int main()
   cudaStreamCreate(&s);
 
   // launch a CUDA kernel with only 1 thread on our stream
+#ifdef THRUST_EXAMPLE_DEVICE_SIDE
   reduce_kernel<<<1,1,0,s>>>(data.begin(), data.end(), 0, thrust::plus<int>(), result.data());
+#else
+  result[0] = thrust::reduce(thrust::cuda::par, data.begin(), data.end(), 0, thrust::plus<int>());
+#endif
 
   // wait for the stream to finish
   cudaStreamSynchronize(s);
@@ -52,7 +59,7 @@ int main()
   // reset the result
   result[0] = 0;
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
   // method 2: use std::async to create asynchrony
 
   // copy all the algorithm parameters
diff --git a/examples/cuda/custom_temporary_allocation.cu b/examples/cuda/custom_temporary_allocation.cu
index fe08e5f95..7bba0fa9e 100644
--- a/examples/cuda/custom_temporary_allocation.cu
+++ b/examples/cuda/custom_temporary_allocation.cu
@@ -10,13 +10,16 @@
 #include <map>
 #include <cassert>
 
-// This example demonstrates how to intercept calls to get_temporary_buffer
-// and return_temporary_buffer to control how Thrust allocates temporary storage
-// during algorithms such as thrust::sort. The idea will be to create a simple
-// cache of allocations to search when temporary storage is requested. If a hit
-// is found in the cache, we quickly return the cached allocation instead of
-// resorting to the more expensive thrust::cuda::malloc.
-//
+// This example demonstrates how to control how Thrust allocates temporary
+// storage during algorithms such as thrust::sort. The idea will be to create a
+// simple cache of allocations to search when temporary storage is requested.
+// If a hit is found in the cache, we quickly return the cached allocation
+// instead of resorting to the more expensive thrust::cuda::malloc.
+
+// Note: Thrust now has its own caching allocator layer; if you just need a
+// caching allocator, you ought to use that. This example is still useful
+// as a demonstration of how to use a Thrust custom allocator.
+
 // Note: this implementation cached_allocator is not thread-safe. If multiple
 // (host) threads use the same cached_allocator then they should gain exclusive
 // access to the allocator before accessing its methods.
diff --git a/examples/cuda/explicit_cuda_stream.cu b/examples/cuda/explicit_cuda_stream.cu
new file mode 100644
index 000000000..303a14723
--- /dev/null
+++ b/examples/cuda/explicit_cuda_stream.cu
@@ -0,0 +1,80 @@
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h> // For thrust::device
+#include <thrust/reduce.h>
+#include <thrust/sequence.h>
+
+#include <cuda_runtime.h>
+
+#include <iostream>
+
+// This example shows how to execute a Thrust device algorithm on an explicit
+// CUDA stream. The simple program below fills a vector with the numbers
+// [0, 1000) (thrust::sequence) and then performs a scan operation
+// (thrust::inclusive_scan) on them. Both algorithms are executed on the same
+// custom CUDA stream using the CUDA execution policies.
+//
+// Thrust provides two execution policies that accept CUDA streams that differ
+// in when/if they synchronize the stream:
+// 1. thrust::cuda::par.on(stream)
+//      - `stream` will *always* be synchronized before an algorithm returns.
+//      - This is the default `thrust::device` policy when compiling with the
+//        CUDA device backend.
+// 2. thrust::cuda::par_nosync.on(stream)
+//      - `stream` will only be synchronized when necessary for correctness
+//        (e.g., returning a result from `thrust::reduce`). This is a hint that
+//        may be ignored by an algorithm's implementation.
+
+int main()
+{
+  thrust::device_vector<int> d_vec(1000);
+
+  // Create the stream:
+  cudaStream_t custom_stream;
+  cudaError_t err = cudaStreamCreate(&custom_stream);
+  if (err != cudaSuccess)
+  {
+    std::cerr << "Error creating stream: " << cudaGetErrorString(err) << "\n";
+    return 1;
+  }
+
+  // Construct a new `nosync` execution policy with the custom stream
+  auto nosync_exec_policy = thrust::cuda::par_nosync.on(custom_stream);
+
+  // Fill the vector with sequential data.
+  // This will execute using the custom stream and the stream will *not* be
+  // synchronized before the function returns, meaning asynchronous work may
+  // still be executing after returning and the contents of `d_vec` are
+  // undefined. Synchronization is not needed here because the following
+  // `inclusive_scan` is executed on the same stream and is therefore guaranteed
+  // to be ordered after the `sequence`
+  thrust::sequence(nosync_exec_policy, d_vec.begin(), d_vec.end());
+
+  // Construct a new *synchronous* execution policy with the same custom stream
+  auto sync_exec_policy = thrust::cuda::par.on(custom_stream);
+
+  // Compute in-place inclusive sum scan of data in the vector.
+  // This also executes in the custom stream, but the execution policy ensures
+  // the stream is synchronized before the algorithm returns. This guarantees
+  // there is no pending asynchronous work and the contents of `d_vec` are
+  // immediately accessible.
+  thrust::inclusive_scan(sync_exec_policy,
+                         d_vec.cbegin(),
+                         d_vec.cend(),
+                         d_vec.begin());
+
+  // This access is only valid because the stream has been synchronized
+  int sum = d_vec.back();
+
+  // Free the stream:
+  err = cudaStreamDestroy(custom_stream);
+  if (err != cudaSuccess)
+  {
+    std::cerr << "Error destroying stream: " << cudaGetErrorString(err) << "\n";
+    return 1;
+  }
+
+  // Print the sum:
+  std::cout << "sum is " << sum << std::endl;
+
+  return 0;
+}
diff --git a/examples/cuda/global_device_vector.cu b/examples/cuda/global_device_vector.cu
index 1419cae62..a99566796 100644
--- a/examples/cuda/global_device_vector.cu
+++ b/examples/cuda/global_device_vector.cu
@@ -1,3 +1,4 @@
+#include <thrust/detail/config.h>
 #include <thrust/device_vector.h>
 
 // If you create a global `thrust::device_vector` with the default allocator,
@@ -20,7 +21,7 @@ typedef thrust::system::cuda::detail::cuda_memory_resource<
   thrust::cuda::pointer<void>
 > device_ignore_shutdown_memory_resource;
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
   template <typename T>
   using device_ignore_shutdown_allocator = 
     thrust::mr::stateless_resource_allocator<
diff --git a/examples/cuda/range_view.cu b/examples/cuda/range_view.cu
index e863a6199..2ede62047 100644
--- a/examples/cuda/range_view.cu
+++ b/examples/cuda/range_view.cu
@@ -226,7 +226,7 @@ int main()
 
   // print values from original device_vector<float> Z 
   // to ensure that range view was mapped to this vector
-  for (int i = 0, n = Z.size(); i < n; ++i)
+  for (std::size_t i = 0, n = Z.size(); i < n; ++i)
   {
     cout << "z[" << i << "]= " << Z[i] << endl;
   }
diff --git a/examples/discrete_voronoi.cu b/examples/discrete_voronoi.cu
index 93e7e5622..bfbf2242d 100644
--- a/examples/discrete_voronoi.cu
+++ b/examples/discrete_voronoi.cu
@@ -4,10 +4,10 @@
 #include <thrust/extrema.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/random.h>
-#include <iostream>
 
+#include <iostream>
 #include <iomanip>
-#include <stdio.h>
+#include <fstream>
 #include <cmath>
 
 #include "include/timer.h"
@@ -135,21 +135,26 @@ void generate_random_sites(thrust::host_vector<int> &t, int Nb, int m, int n)
 //Export the tab to PGM image format
 void vector_to_pgm(thrust::host_vector<int> &t, int m, int n, const char *out)
 {
-    FILE *f;
+    assert(static_cast<int>(t.size()) == m * n &&
+           "Vector size does not match image dims.");
 
-    f=fopen(out,"w+t");
-    fprintf(f,"P2\n");
-    fprintf(f,"%d %d\n 253\n",m,n);
+    std::fstream f(out, std::fstream::out);
+    f << "P2\n";
+    f << m << " " << n << "\n";
+    f << "253\n";
+
+    //Hash function to map values to [0,255]
+    auto to_grey_level = [](int in_value) -> int
+    {
+        return (71 * in_value) % 253;
+    };
 
-    for(int j = 0; j < n ; j++)
+    for (int value : t)
     {
-        for(int i = 0; i < m ; i++)
-        {
-            fprintf(f,"%d ",(int)(71*t[j*m+i])%253); //Hash function to map values to [0,255]
-        }
+      f << to_grey_level(value) << " ";
     }
-    fprintf(f,"\n");
-    fclose(f);
+    f << "\n";
+    f.close();
 }
 
 /************Main Jfa loop********************/
diff --git a/examples/dot_products_with_zip.cu b/examples/dot_products_with_zip.cu
index 52e33d8e6..81ff7ac12 100644
--- a/examples/dot_products_with_zip.cu
+++ b/examples/dot_products_with_zip.cu
@@ -6,9 +6,9 @@
 #include <thrust/random.h>
 
 
-// This example shows how thrust::zip_iterator can be used to create a 
-// 'virtual' array of structures.  In this case the structure is a 3d 
-// vector type (Float3) whose (x,y,z) components will be stored in 
+// This example shows how thrust::zip_iterator can be used to create a
+// 'virtual' array of structures.  In this case the structure is a 3d
+// vector type (Float3) whose (x,y,z) components will be stored in
 // three separate float arrays.  The zip_iterator "zips" these arrays
 // into a single virtual Float3 array.
 
@@ -54,17 +54,17 @@ int main(void)
     // We'll store the components of the 3d vectors in separate arrays. One set of
     // arrays will store the 'A' vectors and another set will store the 'B' vectors.
 
-    // This 'structure of arrays' (SoA) approach is usually more efficient than the 
+    // This 'structure of arrays' (SoA) approach is usually more efficient than the
     // 'array of structures' (AoS) approach.  The primary reason is that structures,
     // like Float3, don't always obey the memory coalescing rules, so they are not
     // efficiently transferred to and from memory.  Another reason to prefer SoA to
     // AoS is that we don't aways want to process all members of the structure.  For
-    // example, if we only need to look at first element of the structure then it 
+    // example, if we only need to look at first element of the structure then it
     // is wasteful to load the entire structure from memory.  With the SoA approach,
     // we can chose which elements of the structure we wish to read.
 
     thrust::device_vector<float> A0 = random_vector(N);  // x components of the 'A' vectors
-    thrust::device_vector<float> A1 = random_vector(N);  // y components of the 'A' vectors 
+    thrust::device_vector<float> A1 = random_vector(N);  // y components of the 'A' vectors
     thrust::device_vector<float> A2 = random_vector(N);  // z components of the 'A' vectors
 
     thrust::device_vector<float> B0 = random_vector(N);  // x components of the 'B' vectors
@@ -78,7 +78,7 @@ int main(void)
     // We'll now illustrate two ways to use zip_iterator to compute the dot
     // products.  The first method is verbose but shows how the parts fit together.
     // The second method hides these details and is more concise.
-   
+
 
     // METHOD #1
     // Defining a zip_iterator type can be a little cumbersome ...
@@ -87,24 +87,24 @@ int main(void)
     typedef thrust::zip_iterator<FloatIteratorTuple>                   Float3Iterator;
 
     // Now we'll create some zip_iterators for A and B
-    Float3Iterator A_first = thrust::make_zip_iterator(make_tuple(A0.begin(), A1.begin(), A2.begin()));
-    Float3Iterator A_last  = thrust::make_zip_iterator(make_tuple(A0.end(),   A1.end(),   A2.end()));
-    Float3Iterator B_first = thrust::make_zip_iterator(make_tuple(B0.begin(), B1.begin(), B2.begin()));
-                            
+    Float3Iterator A_first = thrust::make_zip_iterator(thrust::make_tuple(A0.begin(), A1.begin(), A2.begin()));
+    Float3Iterator A_last  = thrust::make_zip_iterator(thrust::make_tuple(A0.end(),   A1.end(),   A2.end()));
+    Float3Iterator B_first = thrust::make_zip_iterator(thrust::make_tuple(B0.begin(), B1.begin(), B2.begin()));
+
     // Finally, we pass the zip_iterators into transform() as if they
     // were 'normal' iterators for a device_vector<Float3>.
     thrust::transform(A_first, A_last, B_first, result.begin(), DotProduct());
 
 
     // METHOD #2
-    // Alternatively, we can avoid creating variables for X_first, X_last, 
+    // Alternatively, we can avoid creating variables for X_first, X_last,
     // and Y_first and invoke transform() directly.
-    thrust::transform( thrust::make_zip_iterator(make_tuple(A0.begin(), A1.begin(), A2.begin())),
-                       thrust::make_zip_iterator(make_tuple(A0.end(),   A1.end(),   A2.end())),
-                       thrust::make_zip_iterator(make_tuple(B0.begin(), B1.begin(), B2.begin())),
+    thrust::transform( thrust::make_zip_iterator(thrust::make_tuple(A0.begin(), A1.begin(), A2.begin())),
+                       thrust::make_zip_iterator(thrust::make_tuple(A0.end(),   A1.end(),   A2.end())),
+                       thrust::make_zip_iterator(thrust::make_tuple(B0.begin(), B1.begin(), B2.begin())),
                        result.begin(),
                        DotProduct() );
-    
+
 
 
     // Finally, we'll print a few results
@@ -126,8 +126,8 @@ int main(void)
         std::cout << "(" << thrust::get<0>(b) << "," << thrust::get<1>(b) << "," << thrust::get<2>(b) << ")";
         std::cout << " = ";
         std::cout << dot << std::endl;
-    }   
+    }
 
     return 0;
 }
- 
+
diff --git a/examples/expand.cu b/examples/expand.cu
index 4547bcd13..f61edec8f 100644
--- a/examples/expand.cu
+++ b/examples/expand.cu
@@ -51,7 +51,6 @@ OutputIterator expand(InputIterator1 first1,
      thrust::maximum<difference_type>());
 
   // gather input values according to index array (output = first2[output_indices])
-  OutputIterator output_end = output; thrust::advance(output_end, output_size);
   thrust::gather(output_indices.begin(),
                  output_indices.end(),
                  first2,
diff --git a/examples/raw_reference_cast.cu b/examples/raw_reference_cast.cu
index ec9a9783f..d6c854590 100644
--- a/examples/raw_reference_cast.cu
+++ b/examples/raw_reference_cast.cu
@@ -84,11 +84,9 @@ int main(void)
   typedef Vector::iterator           Iterator;
   typedef thrust::device_system_tag  System;
 
-  size_t N = 5;
-
   // allocate device memory
-  Vector A(N);
-  Vector B(N);
+  Vector A(5);
+  Vector B(5);
 
   // initialize A and B
   thrust::sequence(A.begin(), A.end());
@@ -100,7 +98,7 @@ int main(void)
 
   // note: we must specify the System to ensure correct execution
   thrust::for_each(thrust::counting_iterator<int,System>(0),
-                   thrust::counting_iterator<int,System>(N),
+                   thrust::counting_iterator<int,System>(5),
                    copy_iterators<Iterator,Iterator>(A.begin(), B.begin()));
   
   std::cout << "After A->B Copy" << std::endl;
diff --git a/examples/scan_matrix_by_rows.cu b/examples/scan_matrix_by_rows.cu
index df303d8bd..2cf1986e9 100644
--- a/examples/scan_matrix_by_rows.cu
+++ b/examples/scan_matrix_by_rows.cu
@@ -1,5 +1,6 @@
 #include <thrust/device_vector.h>
 #include <thrust/scan.h>
+#include <thrust/sequence.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
 
@@ -20,7 +21,7 @@ void scan_matrix_by_rows0(thrust::device_vector<int>& u, int n, int m) {
 
 // We can batch the operation using `thrust::inclusive_scan_by_key`, which
 // scans each group of consecutive equal keys. All we need to do is generate
-// the right key sequence. We want the keys for elements on the same row to 
+// the right key sequence. We want the keys for elements on the same row to
 // be identical.
 
 // So first, we define an unary function object which takes the index of an
diff --git a/examples/sort.cu b/examples/sort.cu
index 700fc5f3f..1bbb5d897 100644
--- a/examples/sort.cu
+++ b/examples/sort.cu
@@ -41,7 +41,7 @@ void initialize(thrust::device_vector<int>& v1, thrust::device_vector<int>& v2)
   for(size_t i = 0; i < v1.size(); i++)
   {
     v1[i] = dist(rng);
-    v2[i] = i;
+    v2[i] = static_cast<int>(i);
   }
 }
 
diff --git a/examples/sorting_aos_vs_soa.cu b/examples/sorting_aos_vs_soa.cu
index 1bf990982..649a78ab1 100644
--- a/examples/sorting_aos_vs_soa.cu
+++ b/examples/sorting_aos_vs_soa.cu
@@ -1,3 +1,4 @@
+#include <thrust/host_vector.h>
 #include <thrust/device_vector.h>
 #include <thrust/sort.h>
 #include <thrust/random.h>
@@ -7,7 +8,7 @@
 
 // This examples compares sorting performance using Array of Structures (AoS)
 // and Structure of Arrays (SoA) data layout.  Legacy applications will often
-// store data in C/C++ structs, such as MyStruct defined below.  Although 
+// store data in C/C++ structs, such as MyStruct defined below.  Although
 // Thrust can process array of structs, it is typically less efficient than
 // the equivalent structure of arrays layout.  In this particular example,
 // the optimized SoA approach is approximately *five times faster* than the
@@ -57,7 +58,7 @@ int main(void)
 {
   size_t N = 2 * 1024 * 1024;
 
-  // Sort Key-Value pairs using Array of Structures (AoS) storage 
+  // Sort Key-Value pairs using Array of Structures (AoS) storage
   {
     thrust::device_vector<MyStruct> structures(N);
 
@@ -71,7 +72,7 @@ int main(void)
     std::cout << "AoS sort took " << 1e3 * t.elapsed() << " milliseconds" << std::endl;
   }
 
-  // Sort Key-Value pairs using Structure of Arrays (SoA) storage 
+  // Sort Key-Value pairs using Structure of Arrays (SoA) storage
   {
     thrust::device_vector<int>   keys(N);
     thrust::device_vector<float> values(N);
diff --git a/examples/sparse_vector.cu b/examples/sparse_vector.cu
index c7528cff2..463bfa008 100644
--- a/examples/sparse_vector.cu
+++ b/examples/sparse_vector.cu
@@ -11,7 +11,6 @@ template <typename IndexVector,
 void print_sparse_vector(const IndexVector& A_index,
                          const ValueVector& A_value)
 {
-    // sanity test
     assert(A_index.size() == A_value.size());
 
     for(size_t i = 0; i < A_index.size(); i++)
@@ -35,7 +34,6 @@ void sum_sparse_vectors(const IndexVector1& A_index,
     typedef typename IndexVector3::value_type  IndexType;
     typedef typename ValueVector3::value_type  ValueType;
 
-    // sanity test
     assert(A_index.size() == A_value.size());
     assert(B_index.size() == B_value.size());
 
@@ -53,7 +51,7 @@ void sum_sparse_vectors(const IndexVector1& A_index,
                          B_value.begin(),
                          temp_index.begin(),
                          temp_value.begin());
-    
+
     // compute number of unique indices
     size_t C_size = thrust::inner_product(temp_index.begin(), temp_index.end() - 1,
                                           temp_index.begin() + 1,
@@ -83,7 +81,7 @@ int main(void)
     A_index[1] = 3;  A_value[1] = 60;
     A_index[2] = 5;  A_value[2] = 20;
     A_index[3] = 8;  A_value[3] = 40;
-    
+
     // initialize sparse vector B with 6 elements
     thrust::device_vector<int>   B_index(6);
     thrust::device_vector<float> B_value(6);
@@ -97,7 +95,7 @@ int main(void)
     // compute sparse vector C = A + B
     thrust::device_vector<int>   C_index;
     thrust::device_vector<float> C_value;
-    
+
     sum_sparse_vectors(A_index, A_value, B_index, B_value, C_index, C_value);
 
     std::cout << "Computing C = A + B for sparse vectors A and B" << std::endl;
diff --git a/examples/transform_input_output_iterator.cu b/examples/transform_input_output_iterator.cu
new file mode 100644
index 000000000..afdccc35a
--- /dev/null
+++ b/examples/transform_input_output_iterator.cu
@@ -0,0 +1,111 @@
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+#include <thrust/gather.h>
+#include <thrust/iterator/transform_input_output_iterator.h>
+#include <thrust/sequence.h>
+#include <iostream>
+
+// Base 2 fixed point
+class ScaledInteger
+{
+  int value_;
+  int scale_;
+
+public:
+  __host__ __device__
+  ScaledInteger(int value, int scale): value_{value}, scale_{scale} {}
+
+  __host__ __device__
+  int value() const { return value_; }
+
+  __host__ __device__
+  ScaledInteger rescale(int scale) const
+  {
+    int shift = scale - scale_;
+    int result = shift < 0 ? value_ << (-shift) : value_ >> shift;
+    return ScaledInteger{result, scale};
+  }
+
+  __host__ __device__
+  friend ScaledInteger operator+(ScaledInteger a, ScaledInteger b)
+  {
+    // Rescale inputs to the lesser of the two scales
+    if (b.scale_ < a.scale_)
+      a = a.rescale(b.scale_);
+    else if (a.scale_ < b.scale_)
+      b = b.rescale(a.scale_);
+    return ScaledInteger{a.value_ + b.value_, a.scale_};
+  }
+};
+
+struct ValueToScaledInteger
+{
+  int scale;
+
+  __host__ __device__
+  ScaledInteger operator()(const int& value) const
+  {
+    return ScaledInteger{value, scale};
+  }
+};
+
+struct ScaledIntegerToValue
+{
+  int scale;
+
+  __host__ __device__
+  int operator()(const ScaledInteger& scaled) const
+  {
+    return scaled.rescale(scale).value();
+  }
+};
+
+int main(void)
+{
+  const size_t size = 4;
+  thrust::device_vector<int> A(size);
+  thrust::device_vector<int> B(size);
+  thrust::device_vector<int> C(size);
+
+  thrust::sequence(A.begin(), A.end(), 1);
+  thrust::sequence(B.begin(), B.end(), 5);
+
+  const int A_scale = 16; // Values in A are left shifted by 16
+  const int B_scale = 8;  // Values in B are left shifted by 8
+  const int C_scale = 4;  // Values in C are left shifted by 4
+
+  auto A_begin = thrust::make_transform_input_output_iterator(A.begin(),
+                    ValueToScaledInteger{A_scale}, ScaledIntegerToValue{A_scale});
+  auto A_end   = thrust::make_transform_input_output_iterator(A.end(),
+                    ValueToScaledInteger{A_scale}, ScaledIntegerToValue{A_scale});
+  auto B_begin = thrust::make_transform_input_output_iterator(B.begin(),
+                    ValueToScaledInteger{B_scale}, ScaledIntegerToValue{B_scale});
+  auto C_begin = thrust::make_transform_input_output_iterator(C.begin(),
+                    ValueToScaledInteger{C_scale}, ScaledIntegerToValue{C_scale});
+
+  // Sum A and B as ScaledIntegers, storing the scaled result in C
+  thrust::transform(A_begin, A_end, B_begin, C_begin, thrust::plus<ScaledInteger>{});
+
+  thrust::host_vector<int> A_h(A);
+  thrust::host_vector<int> B_h(B);
+  thrust::host_vector<int> C_h(C);
+
+  std::cout << std::hex;
+
+  std::cout << "Expected [ ";
+  for (size_t i = 0; i < size; i++) {
+    const int expected = ((A_h[i] << A_scale) + (B_h[i] << B_scale)) >> C_scale;
+    std::cout << expected <<  " ";
+  }
+  std::cout << "] \n";
+
+  std::cout << "Result   [ ";
+  for (size_t i = 0; i < size; i++) {
+    std::cout << C_h[i] <<  " ";
+  }
+  std::cout << "] \n";
+
+  return 0;
+}
+
diff --git a/examples/uninitialized_vector.cu b/examples/uninitialized_vector.cu
index 5f522a809..90e8141fa 100644
--- a/examples/uninitialized_vector.cu
+++ b/examples/uninitialized_vector.cu
@@ -29,6 +29,10 @@ template<typename T>
   __host__
   ~uninitialized_allocator() {}
 
+#if THRUST_CPP_DIALECT >= 2011
+  uninitialized_allocator & operator=(const uninitialized_allocator &) = default;
+#endif
+
   // for correctness, you should also redefine rebind when you inherit
   // from an allocator type; this way, if the allocator is rebound somewhere,
   // it's going to be rebound to the correct type - and not to its base
diff --git a/generate_mk.py b/generate_mk.py
index 46042036c..84071338c 100755
--- a/generate_mk.py
+++ b/generate_mk.py
@@ -6,6 +6,7 @@
 #   A single example or unit test source file generates its own executable
 #   This program is called by a top level Makefile, but can also be used stand-alone for debugging
 #   This program also generates testing.mk, examples.mk and dependencies.mk
+from __future__ import print_function
 import sys
 import shutil as sh
 import os
@@ -31,7 +32,7 @@ def Glob(pattern, directory,exclude='\B'):
 
 
 def generate_test_mk(mk_path, test_path, group, TEST_DIR):
-    print 'Generating makefiles in "'+mk_path+'" for tests in "'+test_path+'"'
+    print('Generating makefiles in "'+mk_path+'" for tests in "'+test_path+'"')
     src_cu  = Glob("*.cu",  test_path, ".*testframework.cu$")
     src_cxx = Glob("*.cpp", test_path)
     src_cu.sort();
@@ -52,7 +53,7 @@ def generate_test_mk(mk_path, test_path, group, TEST_DIR):
     return [tests_all, dependencies_all]
 
 def generate_example_mk(mk_path, example_path, group, EXAMPLE_DIR):
-    print 'Generating makefiles in "'+mk_path+'" for examples in "'+example_path+'"'
+    print('Generating makefiles in "'+mk_path+'" for examples in "'+example_path+'"')
     src_cu  = Glob("*.cu",  example_path)
     src_cxx = Glob("*.cpp", example_path)
     src_cu.sort();
diff --git a/internal/benchmark/CMakeLists.txt b/internal/benchmark/CMakeLists.txt
new file mode 100644
index 000000000..8c59747b8
--- /dev/null
+++ b/internal/benchmark/CMakeLists.txt
@@ -0,0 +1,30 @@
+if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+  # MSVC builds fail at runtime. Benchmarks are linux-only for now.
+  message(STATUS "Thrust benchmarking is not available on MSVC.")
+  return()
+endif()
+
+add_custom_target(thrust.all.bench)
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_host ${thrust_target} HOST)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+
+  # Skip non cpp.cuda targets:
+  if (NOT config_host   STREQUAL "CPP" OR
+      NOT config_device STREQUAL "CUDA")
+    continue()
+  endif()
+
+  set(bench_target ${config_prefix}.bench)
+
+  add_executable(${bench_target} bench.cu)
+  target_link_libraries(${bench_target} PRIVATE ${thrust_target})
+  target_include_directories(${bench_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
+  thrust_clone_target_properties(${bench_target} ${thrust_target})
+  thrust_fix_clang_nvcc_build_for(${bench_target})
+
+  add_dependencies(thrust.all.bench ${bench_target})
+  add_dependencies(${config_prefix}.all ${bench_target})
+endforeach()
diff --git a/internal/benchmark/bench.cu b/internal/benchmark/bench.cu
index eba49f608..38d1d647a 100644
--- a/internal/benchmark/bench.cu
+++ b/internal/benchmark/bench.cu
@@ -4,6 +4,14 @@
 #include <thrust/sort.h>
 #include <thrust/reduce.h>
 #include <thrust/scan.h>
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+#include <thrust/random.h>
+#include <thrust/shuffle.h>
+
+#include <random>
+#endif
 
 #include <algorithm>
 #include <numeric>
@@ -42,7 +50,7 @@
 
 // We don't use THRUST_NOEXCEPT because it's new, and we want this benchmark to
 // be backwards-compatible to older versions of Thrust.
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
   #define NOEXCEPT noexcept
 #else
   #define NOEXCEPT throw()
@@ -393,7 +401,6 @@ struct experiment_driver
     );
     #endif
 
-/*
     stl_average_walltime = round_to_precision(
         stl_average_walltime, stl_walltime_precision
     );
@@ -417,7 +424,6 @@ struct experiment_driver
         tbb_walltime_uncertainty, tbb_walltime_precision
     );
     #endif
-*/
 
     // Round the average throughput and throughput uncertainty to the
     // significant figure of the throughput uncertainty.
@@ -436,7 +442,6 @@ struct experiment_driver
     );
     #endif
 
-/*
     stl_average_throughput = round_to_precision(
         stl_average_throughput, stl_throughput_precision
     );
@@ -460,7 +465,6 @@ struct experiment_driver
         tbb_throughput_uncertainty, tbb_throughput_precision
     );
     #endif
-*/
 
     std::cout << THRUST_VERSION                // Thrust Version.
       << ","  << test_name                     // Algorithm.
@@ -695,6 +699,21 @@ struct copy_trial_base : trial_base<TrialKind>
   }
 };
 
+#if THRUST_CPP_DIALECT >= 2011
+template <typename Container, typename TrialKind = regular_trial>
+struct shuffle_trial_base : trial_base<TrialKind>
+{
+  Container input;
+
+  void setup(uint64_t elements)
+  {
+    input.resize(elements);
+
+    randomize(input);
+  }
+};
+#endif
+
 ///////////////////////////////////////////////////////////////////////////////
 
 template <typename T>
@@ -890,6 +909,37 @@ struct copy_tester
   #endif
 };
 
+#if THRUST_CPP_DIALECT >= 2011
+template <typename T>
+struct shuffle_tester
+{
+  static char const* test_name() { return "shuffle"; }
+
+  struct std_trial : shuffle_trial_base<std::vector<T>, baseline_trial>
+  {
+    std::default_random_engine g;
+    void operator()()
+    {
+      std::shuffle(this->input.begin(), this->input.end(), this->g);
+    }
+  };
+
+  struct thrust_trial : shuffle_trial_base<thrust::device_vector<T> >
+  {
+    thrust::default_random_engine g;
+    void operator()()
+    {
+      thrust::shuffle(this->input.begin(), this->input.end(), this->g);
+      #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+        cudaError_t err = cudaDeviceSynchronize();
+        if (err != cudaSuccess)
+          throw thrust::error_code(err, thrust::cuda_category());
+      #endif
+    }
+  };
+};
+#endif
+
 ///////////////////////////////////////////////////////////////////////////////
 
 template <
@@ -941,6 +991,14 @@ void run_core_primitives_experiments_for_type()
     , BaselineTrials
     , RegularTrials
   >::run_experiment();
+
+  experiment_driver<
+      shuffle_tester
+    , ElementMetaType
+    , Elements / sizeof(typename ElementMetaType::type)
+    , BaselineTrials
+    , RegularTrials
+  >::run_experiment();
 }
 
 ///////////////////////////////////////////////////////////////////////////////
diff --git a/internal/build/common_build.mk b/internal/build/common_build.mk
index a77a5e940..25cee6bb4 100644
--- a/internal/build/common_build.mk
+++ b/internal/build/common_build.mk
@@ -1,41 +1,18 @@
 USE_NEW_PROJECT_MK := 1
 
+CCCL_ENABLE_DEPRECATIONS := 1
+
 ifeq ($(OS),Linux)
   LIBRARIES += m
 endif
 
-include $(ROOTDIR)/thrust/internal/build/common_warnings.mk
+include $(ROOTDIR)/thrust/internal/build/common_compiler.mk
 
 # Add /bigobj to Windows build flag to workaround building Thrust with debug
 ifeq ($(OS),win32)
   CUDACC_FLAGS += -Xcompiler "/bigobj"
 endif
 
-ARCH_NEG_FILTER += 20 21
-# Determine which SASS to generate
-# if DVS (either per-CL or on-demand)
-ifneq ($(or $(THRUST_DVS),$(THRUST_DVS_NIGHTLY)),)
-  # DVS doesn't run Thrust on fermi so filter out SM 2.0/2.1
-  # DVS doesn't run Thrust on mobile so filter those out as well
-  # DVS doesn't have PASCAL configs at the moment
-  ARCH_NEG_FILTER += 20 21 32 37 53 60
-else
-  # If building for ARMv7 (32-bit ARM), build only mobile SASS since no dGPU+ARM32 are supported anymore
-  ifeq ($(TARGET_ARCH),ARMv7)
-    ARCH_FILTER = 32 53 62
-  endif
-  # If its androideabi, we know its mobile, so can target specific SASS
-  ifeq ($(OS),Linux)
-    ifeq ($(ABITYPE), androideabi)
-     ARCH_FILTER = 32 53 62
-     ifeq ($(THRUST_TEST),1)
-       NVCC_OPTIONS += -include "$(ROOTDIR)/cuda/tools/demangler/demangler.h"
-       LIBRARIES += demangler
-     endif
-    endif
-  endif
-endif
-
 # Add -mthumb for Linux on ARM to work around bug in arm cross compiler from p4
 ifeq ($(TARGET_ARCH),ARMv7)
   ifneq ($(HOST_ARCH),ARMv7)
@@ -80,8 +57,15 @@ ifndef BUILD_AGAINST_RELEASE
   else
     INCLUDES_ABSPATH += $(ROOTDIR)/thrust
   endif
+
+  # CUB includes
+  ifdef VULCAN
+    INCLUDES_ABSPATH += $(VULCAN_TOOLKIT_BASE)/cub
+  else
+    INCLUDES_ABSPATH += $(ROOTDIR)/cub
+  endif
 else
-  # CUDA and Thrust includes
+  # CUDA, CUB, and Thrust includes
   INCLUDES_ABSPATH += $(GPGPU_COMPILER_EXPORT)/include
 
   ifeq ($(TARGET_ARCH),ARMv7)
@@ -95,6 +79,8 @@ ifdef VULCAN
   LIBDIRS_ABSPATH  += $(VULCAN_BUILD_DIR)/bin/$(VULCAN_ARCH)_$(VULCAN_OS)$(VULCAN_ABI)_$(VULCAN_BUILD)
 endif
 
+USES_CUDA_DRIVER_HEADERS := 1
+
 ifdef VULCAN_TOOLKIT_BASE
   include $(VULCAN_TOOLKIT_BASE)/build/common.mk
 else
diff --git a/internal/build/common_warnings.mk b/internal/build/common_compiler.mk
similarity index 64%
rename from internal/build/common_warnings.mk
rename to internal/build/common_compiler.mk
index 7809d3752..020159365 100644
--- a/internal/build/common_warnings.mk
+++ b/internal/build/common_compiler.mk
@@ -3,10 +3,18 @@ ifeq ($(OS),$(filter $(OS),Linux Darwin))
     CUDACC_FLAGS += -Xcompiler "-Wall -Wextra -Werror"
 
     ifdef USEXLC
+      CXX_STD := c++14
+
       # GCC does not warn about unused parameters in uninstantiated
       # template functions, but xlC does. This causes xlC to choke on the
       # OMP backend, which is mostly #ifdef'd out when you aren't using it.
       CUDACC_FLAGS += -Xcompiler "-Wno-unused-parameter"
+
+      # xlC is unreasonable about unused functions in a translation unit
+      # when this warning is enabled; this includes warning on most functions
+      # that are defined as static inline in cuda_fp16.h. Disable this warning
+      # entirely under xlC.
+      CUDACC_FLAGS += -Xcompiler "-Wno-unused-function"
     else # GCC, ICC or Clang AKA the sane ones.
       # XXX Enable -Wcast-align.
       CUDACC_FLAGS += -Xcompiler "-Winit-self -Woverloaded-virtual -Wno-cast-align -Wcast-qual -Wno-long-long -Wno-variadic-macros -Wno-unused-function"
@@ -26,6 +34,8 @@ ifeq ($(OS),$(filter $(OS),Linux Darwin))
       endif
 
       ifdef IS_CLANG
+        CXX_STD := c++14
+
         ifdef USE_CLANGLLVM
           CLANG_VERSION = $(shell $(USE_CLANGLLVM) --version 2>/dev/null | head -1 | sed -e 's/.*\([0-9]\)\.\([0-9]\)\(\.[0-9]\).*/\1\2/g')
         else
@@ -66,35 +76,34 @@ ifeq ($(OS),$(filter $(OS),Linux Darwin))
             GCC_VERSION = $(shell $(CCBIN_ENVIRONMENT) $(CCBIN) -dumpversion | sed -e 's/\([0-9]\)\.\([0-9]\)\(\.[0-9]\)\?/\1\2/g')
           endif
 
-          ifeq ($(shell if test $(GCC_VERSION) -lt 42; then echo true; fi),true)
-            # In GCC 4.1.2 and older, numeric conversion warnings are not
-            # suppressable, so shut off -Wno-error.
-            CUDACC_FLAGS += -Xcompiler "-Wno-error"
-          endif
-          ifeq ($(shell if test $(GCC_VERSION) -eq 44; then echo true; fi),true)
-            # In GCC 4.4, the CUDA backend's kernel launch templates cause
-            # impossible-to-decipher "'<anonymous>' is used uninitialized in
-            # this function" warnings, so disable uninitialized variable
-            # warnings.
-            CUDACC_FLAGS += -Xcompiler "-Wno-uninitialized"
-          endif
-          ifeq ($(shell if test $(GCC_VERSION) -ge 45; then echo true; fi),true)
-            # This isn't available until GCC 4.3, and misfires on TMP code until
-            # GCC 4.5.
-            CUDACC_FLAGS += -Xcompiler "-Wlogical-op"
+          ifeq ($(shell if test $(GCC_VERSION) -ge 50; then echo true; fi),true)
+            CXX_STD := c++14
+          else
+            CUDACC_FLAGS += -DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT
           endif
+
           ifeq ($(shell if test $(GCC_VERSION) -ge 73; then echo true; fi),true)
             # GCC 7.3 complains about name mangling changes due to `noexcept`
             # becoming part of the type system; we don't care.
             CUDACC_FLAGS += -Xcompiler "-Wno-noexcept-type"
           endif
+          ifeq ($(shell if test $(GCC_VERSION) -ge 80; then echo true; fi),true)
+            # GCC 8.x has a new warning that tries to diagnose technical misuses of
+            # memcpy and memmove. We need to resolve it better than this, but for the
+            # time being, we'll downgrade it from an error to a warning.
+            CUDACC_FLAGS += -Xcompiler "-Wno-error=class-memaccess"
+          endif
         else
           $(error CCBIN is not defined.)
         endif
       endif
     endif
+  else
+    CXX_STD := c++14
   endif
 else ifeq ($(OS),win32)
+  CXX_STD := c++14
+
   # XXX Enable /Wall
   CUDACC_FLAGS += -Xcompiler "/WX"
 
@@ -108,5 +117,44 @@ else ifeq ($(OS),win32)
 
   # Disable warning about applying unary - to unsigned type.
   CUDACC_FLAGS += -Xcompiler "/wd4146"
+
+  # Warning about declspec(allocator) on inappropriate function types
+  CUDACC_FLAGS += -Xcompiler "/wd4494"
+
+  # Allow tests to have lots and lots of sections in each translation unit:
+  CUDACC_FLAGS += -Xcompiler "/bigobj"
 endif
 
+# Promote all NVCC warnings into errors
+CUDACC_FLAGS += -Werror all-warnings
+
+# Print warning numbers with cudafe diagnostics
+CUDACC_FLAGS += -Xcudafe --display_error_number
+
+VERSION_FLAG :=
+ifeq ($(OS),$(filter $(OS),Linux Darwin))
+  ifdef USEPGCXX        # PGI
+    VERSION_FLAG := -V
+  else
+    ifdef USEXLC        # XLC
+      VERSION_FLAG := -qversion
+    else                # GCC, ICC or Clang AKA the sane ones.
+      VERSION_FLAG := --version
+    endif
+  endif
+else ifeq ($(OS),win32) # MSVC
+  # cl.exe run without any options will print its version info and exit.
+  VERSION_FLAG :=
+endif
+
+CCBIN_ENVIRONMENT :=
+ifeq ($(OS), QNX)
+  # QNX's GCC complains if QNX_HOST and QNX_TARGET aren't defined in the
+  # environment.
+  CCBIN_ENVIRONMENT := QNX_HOST=$(QNX_HOST) QNX_TARGET=$(QNX_TARGET)
+endif
+
+$(info #### CCBIN         : $(CCBIN))
+$(info #### CCBIN VERSION : $(shell $(CCBIN_ENVIRONMENT) $(CCBIN) $(VERSION_FLAG)))
+$(info #### CXX_STD       : $(CXX_STD))
+
diff --git a/internal/build/common_detect.mk b/internal/build/common_detect.mk
index df755fe49..e4beb6b88 100644
--- a/internal/build/common_detect.mk
+++ b/internal/build/common_detect.mk
@@ -1,3 +1,5 @@
+CXX_STD = c++11
+
 ifeq ($(THRUST_TEST),1)
   include $(ROOTDIR)/build/getprofile.mk
   include $(ROOTDIR)/build/config/$(PROFILE).mk
diff --git a/internal/build/generic_example.mk b/internal/build/generic_example.mk
index 7441f8665..8fe562245 100644
--- a/internal/build/generic_example.mk
+++ b/internal/build/generic_example.mk
@@ -1,8 +1,6 @@
 # Generic project mk that is included by examples mk
-#  EXAMPLE_NAME  : the name of the example
-#  EXAMPLE_SRC   : path to the source code relative to thrust
-EXECUTABLE         := $(EXAMPLE_NAME)
-BUILD_SRC          := $(ROOTDIR)/thrust/$(EXAMPLE_SRC)
+EXECUTABLE := $(EXAMPLE_NAME)
+BUILD_SRC  := $(ROOTDIR)/thrust/$(EXAMPLE_SRC)
 
 include $(ROOTDIR)/thrust/internal/build/common_detect.mk
 
diff --git a/internal/build/generic_test.mk b/internal/build/generic_test.mk
index 937f903f7..1be548c93 100644
--- a/internal/build/generic_test.mk
+++ b/internal/build/generic_test.mk
@@ -1,8 +1,6 @@
 # Generic project mk that is included by unit tests mk
-#  TEST_NAME  : the name of the test
-#  TEST_SRC   : path to the source code relative to thrust
-EXECUTABLE        := $(TEST_NAME)
-BUILD_SRC         := $(ROOTDIR)/thrust/$(TEST_SRC)
+EXECUTABLE := $(TEST_NAME)
+BUILD_SRC  := $(ROOTDIR)/thrust/$(TEST_SRC)
 
 ifdef VULCAN
   INCLUDES_ABSPATH += $(VULCAN_TOOLKIT_BASE)/thrust/testing
diff --git a/internal/build/warningstester.mk b/internal/build/warningstester.mk
index 7db50f201..f2ceecd8e 100644
--- a/internal/build/warningstester.mk
+++ b/internal/build/warningstester.mk
@@ -1,4 +1,5 @@
 USE_NEW_PROJECT_MK := 1
+
 EXECUTABLE        := warningstester
 PROJ_DIR          := internal/build
 #GENCODE           :=
@@ -23,20 +24,22 @@ endif
 
 CU_FILES += ../test/warningstester.cu
 
-# Thrust includes (thrust/)
+# Thrust includes
 ifdef VULCAN
-INCLUDES += $(VULCAN_INSTALL_DIR)/cuda/include/
+INCLUDES += $(VULCAN_INSTALL_DIR)/cuda/include
 INCLUDES += $(VULCAN_INSTALL_DIR)/cuda/_internal/cudart
+INCLUDES += $(VULCAN_TOOLKIT_BASE)/cub
 else
-INCLUDES += ../../
+INCLUDES += ../..
 INCLUDES += ../../../cuda/tools/cudart
+INCLUDES += ../../../cub
 endif
 
 # Location of generated include file that includes all Thrust public headers
 GENERATED_SOURCES = $(BUILT_CWD)
 CUDACC_FLAGS += -I$(GENERATED_SOURCES)
 
-include $(ROOTDIR)/thrust/internal/build/common_warnings.mk
+include $(ROOTDIR)/thrust/internal/build/common_compiler.mk
 
 ifdef VULCAN_TOOLKIT_BASE
 include $(VULCAN_TOOLKIT_BASE)/build/common.mk
diff --git a/internal/build/warningstester_create_uber_header.py b/internal/build/warningstester_create_uber_header.py
index 29a333063..cef19a43d 100644
--- a/internal/build/warningstester_create_uber_header.py
+++ b/internal/build/warningstester_create_uber_header.py
@@ -46,6 +46,7 @@ def find_headers(base_dir, rel_dir, exclude = ['\B']):
     print('#error no include files found\n')
 
 print('#define THRUST_CPP11_REQUIRED_NO_ERROR')
+print('#define THRUST_CPP14_REQUIRED_NO_ERROR')
 print('#define THRUST_MODERN_GCC_REQUIRED_NO_ERROR')
 for h in headers:
     print('#include <' + h + '>')
diff --git a/internal/scripts/eris_perf.py b/internal/scripts/eris_perf.py
index 7b50a8a85..580471101 100755
--- a/internal/scripts/eris_perf.py
+++ b/internal/scripts/eris_perf.py
@@ -169,6 +169,9 @@ def print_file(p):
 
     for record in reader:
       for variable, directionality in measured_variables:
+        # Don't monitor regressions for STL implementations, nvbug 28980890:
+        if "STL" in variable:
+          continue
         print "&&&& PERF {0}_{1}_{2}bit_{3}mib_{4} {5} {6}{7}".format(
           record["Algorithm"],
           record["Element Type"],
diff --git a/internal/scripts/refresh_from_github2.sh b/internal/scripts/refresh_from_github2.sh
index fb4a2aff1..6b977bcf3 100755
--- a/internal/scripts/refresh_from_github2.sh
+++ b/internal/scripts/refresh_from_github2.sh
@@ -1,4 +1,4 @@
-branch="master"
+branch="main"
 
 while getopts "hb:c:" opt; do
     case $opt in
@@ -37,7 +37,7 @@ set -e
 
 echo "Downloading thrust code from the $branch branch into /tmp/thrust-${branch}"
 rm -rf /tmp/thrust-${branch}
-git clone -q git://github.com/thrust/thrust.git -b ${branch} /tmp/thrust-${branch}
+git clone -q git://github.com/NVIDIA/thrust.git -b ${branch} /tmp/thrust-${branch}
 
 cd `dirname $0`/../..
 echo "Changed current directory to `pwd`"
diff --git a/internal/test/thrust.example.cuda.explicit_cuda_stream.filecheck b/internal/test/thrust.example.cuda.explicit_cuda_stream.filecheck
new file mode 100644
index 000000000..8b81c77d3
--- /dev/null
+++ b/internal/test/thrust.example.cuda.explicit_cuda_stream.filecheck
@@ -0,0 +1 @@
+     CHECK: sum is 499500
diff --git a/internal/test/thrust.example.transform_input_output_iterator.filecheck b/internal/test/thrust.example.transform_input_output_iterator.filecheck
new file mode 100644
index 000000000..caeca2de5
--- /dev/null
+++ b/internal/test/thrust.example.transform_input_output_iterator.filecheck
@@ -0,0 +1,2 @@
+     CHECK: Expected [ 1050 2060 3070 4080 ]
+CHECK-NEXT: Result   [ 1050 2060 3070 4080 ]
diff --git a/internal/test/thrust.sanity.filecheck b/internal/test/thrust.sanity.filecheck
deleted file mode 100644
index 1770bc9f3..000000000
--- a/internal/test/thrust.sanity.filecheck
+++ /dev/null
@@ -1 +0,0 @@
-     CHECK: SANITY
diff --git a/internal/test/thrust.smoke.filecheck b/internal/test/thrust.smoke.filecheck
new file mode 100644
index 000000000..6906f6d86
--- /dev/null
+++ b/internal/test/thrust.smoke.filecheck
@@ -0,0 +1 @@
+     CHECK: SMOKE
diff --git a/internal/test/thrust_nightly.pl b/internal/test/thrust_nightly.pl
index 61e03bda4..ab5815111 100755
--- a/internal/test/thrust_nightly.pl
+++ b/internal/test/thrust_nightly.pl
@@ -182,12 +182,12 @@ sub process_return_code {
 
 my $have_filecheck = 1;
 
-sub filecheck_sanity {
-    my $filecheck_cmd = "$filecheck_path/FileCheck $filecheck_data_path/thrust.sanity.filecheck";
+sub filecheck_smoke_test {
+    my $filecheck_cmd = "$filecheck_path/FileCheck $filecheck_data_path/thrust.smoke.filecheck";
 
     my $filecheck_pid = open(my $filecheck_stdin, "|-", "$filecheck_cmd 2>&1");
 
-    print $filecheck_stdin "SANITY";
+    print $filecheck_stdin "SMOKE";
 
     my $filecheck_ret = 0;
     if (close($filecheck_stdin) == 0)
@@ -196,21 +196,21 @@ sub filecheck_sanity {
     }
 
     if ($filecheck_ret == 0) {
-      printf("#### SANE FileCheck\n");
+      printf("&&&& PASSED FileCheck\n");
     } else {
       # Use a temporary file to send the output to
       # FileCheck so we can get the output this time,
       # because Perl and bidirectional pipes suck.
       my $tmp = File::Temp->new();
       my $tmp_filename = $tmp->filename;
-      print $tmp "SANITY";
+      print $tmp "SMOKE";
 
       printf("********************************************************************************\n");
       print `$filecheck_cmd -input-file $tmp_filename`;
       printf("********************************************************************************\n");
 
-      process_return_code("FileCheck Sanity", $filecheck_ret, "");
-      printf("#### INSANE FileCheck\n");
+      process_return_code("FileCheck Test", $filecheck_ret, "");
+      printf("&&&& FAILED FileCheck\n");
 
       $have_filecheck = 0;
     }
@@ -243,7 +243,7 @@ sub run_cmd {
         {
           $ret = $?;
         }
- 
+
         alarm 0;
     };
     my $elapsed = timestamp() - $start;
@@ -286,7 +286,7 @@ sub run_examples {
     {
         my $test_exe = $test;
 
-        # Ignore FileCheck files. 
+        # Ignore FileCheck files.
         if ($test =~ /[.]filecheck$/)
         {
           next;
@@ -403,7 +403,7 @@ sub run_unit_tests {
     {
         my $test_exe = $test;
 
-        # Ignore FileCheck files. 
+        # Ignore FileCheck files.
         if ($test =~ /[.]filecheck$/)
         {
           next;
@@ -558,6 +558,7 @@ sub dvs_summary {
 
     printf("\n");
 
+    # We can't remove "sanity" here yet because DVS looks for this exact string.
     printf("CUDA DVS BASIC SANITY SCORE : %.1f\n", $dvs_score);
 
     if ($failures + $errors > 0) {
@@ -582,7 +583,7 @@ sub dvs_summary {
 
 printf("\n");
 
-filecheck_sanity();
+filecheck_smoke_test();
 
 printf("\n");
 
diff --git a/scripts/gdb-pretty-printers.py b/scripts/gdb-pretty-printers.py
new file mode 100644
index 000000000..15d790411
--- /dev/null
+++ b/scripts/gdb-pretty-printers.py
@@ -0,0 +1,153 @@
+import gdb
+import sys
+
+if sys.version_info[0] > 2:
+    Iterator = object
+else:
+    # "Polyfill" for Python2 Iterator interface
+    class Iterator:
+        def next(self):
+            return self.__next__()
+
+
+class ThrustVectorPrinter(gdb.printing.PrettyPrinter):
+    "Print a thrust::*_vector"
+
+    class _host_accessible_iterator(Iterator):
+        def __init__(self, start, size):
+            self.item = start
+            self.size = size
+            self.count = 0
+
+        def __iter__(self):
+            return self
+
+        def __next__(self):
+            if self.count >= self.size:
+                raise StopIteration
+            elt = self.item.dereference()
+            count = self.count
+            self.item = self.item + 1
+            self.count = self.count + 1
+            return ('[%d]' % count, elt)
+
+    class _device_iterator(Iterator):
+        def __init__(self, start, size):
+            self.exec = exec
+            self.item = start
+            self.size = size
+            self.count = 0
+            self.buffer = None
+            self.sizeof = self.item.dereference().type.sizeof
+            self.buffer_start = 0
+            # At most 1 MB or size, at least 1
+            self.buffer_size = min(size, max(1, 2 ** 20 // self.sizeof))
+            self.buffer = gdb.parse_and_eval(
+                '(void*)malloc(%s)' % (self.buffer_size * self.sizeof))
+            self.buffer.fetch_lazy()
+            self.buffer_count = self.buffer_size
+            self.update_buffer()
+
+        def update_buffer(self):
+            if self.buffer_count >= self.buffer_size:
+                self.buffer_item = gdb.parse_and_eval(
+                    hex(self.buffer)).cast(self.item.type)
+                self.buffer_count = 0
+                self.buffer_start = self.count
+                device_addr = hex(self.item.dereference().address)
+                buffer_addr = hex(self.buffer)
+                size = min(self.buffer_size, self.size -
+                           self.buffer_start) * self.sizeof
+                status = gdb.parse_and_eval(
+                    '(cudaError)cudaMemcpy(%s, %s, %d, cudaMemcpyDeviceToHost)' % (buffer_addr, device_addr, size))
+                if status != 0:
+                    raise gdb.MemoryError(
+                        'memcpy from device failed: %s' % status)
+
+        def __del__(self):
+            gdb.parse_and_eval('(void)free(%s)' %
+                               hex(self.buffer)).fetch_lazy()
+
+        def __iter__(self):
+            return self
+
+        def __next__(self):
+            if self.count >= self.size:
+                raise StopIteration
+            self.update_buffer()
+            elt = self.buffer_item.dereference()
+            self.buffer_item = self.buffer_item + 1
+            self.buffer_count = self.buffer_count + 1
+            count = self.count
+            self.item = self.item + 1
+            self.count = self.count + 1
+            return ('[%d]' % count, elt)
+
+    def __init__(self, val):
+        self.val = val
+        self.pointer = val['m_storage']['m_begin']['m_iterator']
+        self.size = int(val['m_size'])
+        self.capacity = int(val['m_storage']['m_size'])
+        self.is_device = False
+        if str(self.pointer.type).startswith("thrust::device_ptr"):
+            self.pointer = self.pointer['m_iterator']
+            self.is_device = True
+
+    def children(self):
+        if self.is_device:
+            return self._device_iterator(self.pointer, self.size)
+        else:
+            return self._host_accessible_iterator(self.pointer, self.size)
+
+    def to_string(self):
+        typename = str(self.val.type)
+        return ('%s of length %d, capacity %d' % (typename, self.size, self.capacity))
+
+    def display_hint(self):
+        return 'array'
+
+
+class ThrustReferencePrinter(gdb.printing.PrettyPrinter):
+    "Print a thrust::device_reference"
+
+    def __init__(self, val):
+        self.val = val
+        self.pointer = val['ptr']['m_iterator']
+        self.type = self.pointer.dereference().type
+        sizeof = self.type.sizeof
+        self.buffer = gdb.parse_and_eval('(void*)malloc(%s)' % sizeof)
+        device_addr = hex(self.pointer)
+        buffer_addr = hex(self.buffer)
+        status = gdb.parse_and_eval('(cudaError)cudaMemcpy(%s, %s, %d, cudaMemcpyDeviceToHost)' % (
+            buffer_addr, device_addr, sizeof))
+        if status != 0:
+            raise gdb.MemoryError('memcpy from device failed: %s' % status)
+        self.buffer_val = gdb.parse_and_eval(
+            hex(self.buffer)).cast(self.pointer.type).dereference()
+
+    def __del__(self):
+        gdb.parse_and_eval('(void)free(%s)' % hex(self.buffer)).fetch_lazy()
+
+    def children(self):
+        return []
+
+    def to_string(self):
+        typename = str(self.val.type)
+        return ('(%s) @%s: %s' % (typename, self.pointer, self.buffer_val))
+
+    def display_hint(self):
+        return None
+
+
+def lookup_thrust_type(val):
+    if not str(val.type.unqualified()).startswith('thrust::'):
+        return None
+    suffix = str(val.type.unqualified())[8:]
+    if suffix.startswith('host_vector') or suffix.startswith('device_vector'):
+        return ThrustVectorPrinter(val)
+    elif int(gdb.VERSION.split(".")[0]) >= 10 and suffix.startswith('device_reference'):
+        return ThrustReferencePrinter(val)
+    return None
+
+
+gdb.pretty_printers.append(lookup_thrust_type)
diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt
new file mode 100644
index 000000000..0f0749c4e
--- /dev/null
+++ b/testing/CMakeLists.txt
@@ -0,0 +1,169 @@
+# Create meta targets that build all tests for a single configuration:
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+  set(config_meta_target ${config_prefix}.tests)
+  add_custom_target(${config_meta_target})
+  add_dependencies(${config_prefix}.all ${config_meta_target})
+endforeach()
+
+# Update flags to reflect RDC options. See note in ThrustCudaConfig.cmake --
+# these flag variables behave unintuitively:
+if (THRUST_ENABLE_TESTS_WITH_RDC)
+  set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_RDC}")
+else()
+  set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_NO_RDC}")
+endif()
+
+# Generate testing framework libraries:
+add_subdirectory(unittest)
+
+# Some tests only support certain host.device configurations. Use this macro to
+# declare allowed configurations. If not specified, all host.device config are
+# used.
+set(restricted_tests)
+macro(thrust_declare_test_restrictions test_name)
+  list(APPEND restricted_tests ${test_name})
+  list(APPEND ${test_name}_host.device_allowed ${ARGN})
+endmacro()
+
+# Async/future/event tests only support the CUDA backend:
+thrust_declare_test_restrictions(async_copy        CPP.CUDA OMP.CUDA TBB.CUDA)
+thrust_declare_test_restrictions(async_for_each    CPP.CUDA OMP.CUDA TBB.CUDA)
+thrust_declare_test_restrictions(async_reduce      CPP.CUDA OMP.CUDA TBB.CUDA)
+thrust_declare_test_restrictions(async_reduce_into CPP.CUDA OMP.CUDA TBB.CUDA)
+thrust_declare_test_restrictions(async_sort        CPP.CUDA OMP.CUDA TBB.CUDA)
+thrust_declare_test_restrictions(async_transform   CPP.CUDA OMP.CUDA TBB.CUDA)
+thrust_declare_test_restrictions(event             CPP.CUDA OMP.CUDA TBB.CUDA)
+thrust_declare_test_restrictions(future            CPP.CUDA OMP.CUDA TBB.CUDA)
+
+# This test is incompatible with TBB and OMP, since it requires special per-device
+# handling to process exceptions in a device function, which is only implemented
+# for CUDA.
+thrust_declare_test_restrictions(unittest_static_assert CPP.CPP CPP.CUDA)
+
+# In the TBB backend, reduce_by_key does not currently work with transform_output_iterator
+# https://github.com/NVIDIA/thrust/issues/1811
+thrust_declare_test_restrictions(transform_output_iterator_reduce_by_key CPP.CPP CPP.OMP CPP.CUDA)
+
+## thrust_add_test
+#
+# Add a test executable and register it with ctest.
+#
+# target_name_var: Variable name to overwrite with the name of the test
+#   target. Useful for post-processing target information per-backend.
+# test_name: The name of the test minus "<config_prefix>.test." For example,
+#   testing/vector.cu will be "vector", and testing/cuda/copy.cu will be
+#   "cuda.copy".
+# test_src: The source file that implements the test.
+# thrust_target: The reference thrust target with configuration information.
+#
+function(thrust_add_test target_name_var test_name test_src thrust_target)
+  thrust_get_target_property(config_host ${thrust_target} HOST)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+
+  # Wrap the .cu file in .cpp for non-CUDA backends
+  if ("CUDA" STREQUAL "${config_device}")
+    set(real_test_src "${test_src}")
+  else()
+    thrust_wrap_cu_in_cpp(real_test_src "${test_src}" ${thrust_target})
+  endif()
+
+  # The actual name of the test's target:
+  set(test_target ${config_prefix}.test.${test_name})
+  set(${target_name_var} ${test_target} PARENT_SCOPE)
+
+  # Related target names:
+  set(config_framework_target ${config_prefix}.test.framework)
+  set(config_meta_target ${config_prefix}.tests)
+  set(test_meta_target thrust.all.test.${test_name})
+
+  add_executable(${test_target} "${real_test_src}")
+  target_link_libraries(${test_target} PRIVATE ${config_framework_target})
+  target_include_directories(${test_target} PRIVATE "${Thrust_SOURCE_DIR}/testing")
+  thrust_clone_target_properties(${test_target} ${thrust_target})
+
+  if (NOT "Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+    target_compile_definitions(${test_target} PRIVATE THRUST_TEST_DEVICE_SIDE)
+  endif()
+
+  thrust_fix_clang_nvcc_build_for(${test_target})
+
+  # Add to the active configuration's meta target
+  add_dependencies(${config_meta_target} ${test_target})
+
+  # Meta target that builds tests with this name for all configurations:
+  if (NOT TARGET ${test_meta_target})
+    add_custom_target(${test_meta_target})
+  endif()
+  add_dependencies(${test_meta_target} ${test_target})
+
+  add_test(NAME ${test_target}
+    COMMAND "${CMAKE_COMMAND}"
+    "-DTHRUST_BINARY=$<TARGET_FILE:${test_target}>"
+    "-DTHRUST_SOURCE=${Thrust_SOURCE_DIR}"
+    -P "${Thrust_SOURCE_DIR}/cmake/ThrustRunTest.cmake"
+  )
+
+  # Run OMP/TBB tests in serial. Multiple OMP processes will massively
+  # oversubscribe the machine with GCC's OMP, and we want to test these with
+  # the full CPU available to each unit test.
+  set(config_systems ${config_host} ${config_device})
+  if (("OMP" IN_LIST config_systems) OR ("TBB" IN_LIST config_systems))
+    set_tests_properties(${test_target} PROPERTIES RUN_SERIAL ON)
+  endif()
+
+  # Check for per-test script. Script will be included in the current scope
+  # to allow custom property modifications.
+  get_filename_component(test_cmake_script "${test_src}" NAME_WLE)
+  set(test_cmake_script "${CMAKE_CURRENT_LIST_DIR}/${test_cmake_script}.cmake")
+  # Use a glob so we can detect if this changes:
+  file(GLOB test_cmake_script
+    RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
+    CONFIGURE_DEPENDS
+    "${test_cmake_script}"
+  )
+  if (test_cmake_script) # Will be non-empty only if the script exists
+    include("${test_cmake_script}")
+  endif()
+endfunction()
+
+file(GLOB test_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
+  CONFIGURE_DEPENDS
+  *.cu *.cpp
+)
+
+# Add common tests to all configs:
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_host ${thrust_target} HOST)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+
+  foreach(test_src IN LISTS test_srcs)
+    get_filename_component(test_name "${test_src}" NAME_WLE)
+
+    # Is this test restricted to only certain host/device combinations?
+    if(${test_name} IN_LIST restricted_tests)
+      # Is the current host/device combination supported?
+      if (NOT "${config_host}.${config_device}" IN_LIST
+            ${test_name}_host.device_allowed)
+        continue()
+      endif()
+    endif()
+
+    thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target})
+
+    if (THRUST_ENABLE_TESTS_WITH_RDC AND ("CUDA" STREQUAL "${config_device}"))
+      thrust_enable_rdc_for_cuda_target(${test_target})
+    endif()
+  endforeach()
+endforeach()
+
+# Add specialized tests:
+add_subdirectory(async)
+add_subdirectory(cmake)
+add_subdirectory(cpp)
+add_subdirectory(cuda)
+add_subdirectory(omp)
+add_subdirectory(regression)
diff --git a/testing/adjacent_difference.cu b/testing/adjacent_difference.cu
index 8e5cd3ff8..5f97ea350 100644
--- a/testing/adjacent_difference.cu
+++ b/testing/adjacent_difference.cu
@@ -2,6 +2,8 @@
 #include <thrust/adjacent_difference.h>
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/iterator/retag.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
 
 template <class Vector>
 void TestAdjacentDifferenceSimple(void)
@@ -13,21 +15,21 @@ void TestAdjacentDifferenceSimple(void)
     input[0] = 1; input[1] = 4; input[2] = 6;
 
     typename Vector::iterator result;
-    
+
     result = thrust::adjacent_difference(input.begin(), input.end(), output.begin());
 
     ASSERT_EQUAL(result - output.begin(), 3);
     ASSERT_EQUAL(output[0], T(1));
     ASSERT_EQUAL(output[1], T(3));
     ASSERT_EQUAL(output[2], T(2));
-    
+
     result = thrust::adjacent_difference(input.begin(), input.end(), output.begin(), thrust::plus<T>());
-    
+
     ASSERT_EQUAL(result - output.begin(), 3);
     ASSERT_EQUAL(output[0], T( 1));
     ASSERT_EQUAL(output[1], T( 5));
     ASSERT_EQUAL(output[2], T(10));
-    
+
     // test in-place operation, result and first are permitted to be the same
     result = thrust::adjacent_difference(input.begin(), input.end(), input.begin());
 
@@ -57,14 +59,14 @@ void TestAdjacentDifference(const size_t n)
     ASSERT_EQUAL(std::size_t(h_result - h_output.begin()), n);
     ASSERT_EQUAL(std::size_t(d_result - d_output.begin()), n);
     ASSERT_EQUAL(h_output, d_output);
-    
+
     h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin(), thrust::plus<T>());
     d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_output.begin(), thrust::plus<T>());
 
     ASSERT_EQUAL(std::size_t(h_result - h_output.begin()), n);
     ASSERT_EQUAL(std::size_t(d_result - d_output.begin()), n);
     ASSERT_EQUAL(h_output, d_output);
-    
+
     // in-place operation
     h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_input.begin(), thrust::plus<T>());
     d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_input.begin(), thrust::plus<T>());
@@ -90,7 +92,7 @@ void TestAdjacentDifferenceInPlaceWithRelatedIteratorTypes(const size_t n)
 
     h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin(), thrust::plus<T>());
     d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_output.begin(), thrust::plus<T>());
-    
+
     // in-place operation with different iterator types
     h_result = thrust::adjacent_difference(h_input.cbegin(), h_input.cend(), h_input.begin(), thrust::plus<T>());
     d_result = thrust::adjacent_difference(d_input.cbegin(), d_input.cend(), d_input.begin(), thrust::plus<T>());
@@ -159,4 +161,3 @@ void TestAdjacentDifferenceDispatchImplicit()
     ASSERT_EQUAL(13, d_input.front());
 }
 DECLARE_UNITTEST(TestAdjacentDifferenceDispatchImplicit);
-
diff --git a/testing/alignment.cu b/testing/alignment.cu
index 6ddf1c73c..e55df2e96 100644
--- a/testing/alignment.cu
+++ b/testing/alignment.cu
@@ -210,7 +210,7 @@ void test_aligned_type()
 DECLARE_UNITTEST(test_aligned_type);
 
 template <std::size_t Len, std::size_t Align>
-void test_aligned_storage_instantiation()
+void test_aligned_storage_instantiation(thrust::detail::true_type /* Align is valid */)
 {
     typedef typename thrust::detail::aligned_storage<Len, Align>::type type;
     ASSERT_GEQUAL(sizeof(type), Len);
@@ -218,6 +218,21 @@ void test_aligned_storage_instantiation()
     ASSERT_EQUAL(thrust::detail::alignment_of<type>::value, Align);
 }
 
+template <std::size_t Len, std::size_t Align>
+void test_aligned_storage_instantiation(thrust::detail::false_type /* Align is invalid */)
+{
+  // no-op -- alignment is > max_align_t and MSVC complains loudly.
+}
+
+template <std::size_t Len, std::size_t Align>
+void test_aligned_storage_instantiation()
+{
+  typedef thrust::detail::integral_constant<
+      bool, Align <= THRUST_ALIGNOF(thrust::detail::max_align_t)>
+      ValidAlign;
+  test_aligned_storage_instantiation<Len, Align>(ValidAlign());
+}
+
 template <std::size_t Len>
 void test_aligned_storage_size()
 {
diff --git a/testing/allocator.cu b/testing/allocator.cu
index edc6f0d52..175685ed0 100644
--- a/testing/allocator.cu
+++ b/testing/allocator.cu
@@ -1,6 +1,10 @@
 #include <unittest/unittest.h>
+#include <thrust/detail/config.h>
 #include <thrust/device_malloc_allocator.h>
 #include <thrust/system/cpp/vector.h>
+
+#include <nv/target>
+
 #include <memory>
 
 template <typename T>
@@ -59,9 +63,12 @@ DECLARE_VARIABLE_UNITTEST(TestAllocatorCustomCopyConstruct);
 template <typename T>
 struct my_allocator_with_custom_destroy
 {
-  typedef T         value_type;
-  typedef T &       reference;
-  typedef const T & const_reference;
+  // This is only used with thrust::cpp::vector:
+  using system_type = thrust::cpp::tag;
+
+  using value_type = T;
+  using reference = T &;
+  using const_reference = const T &;
 
   static bool g_state;
 
@@ -79,9 +86,7 @@ struct my_allocator_with_custom_destroy
   __host__ __device__
   void destroy(T *)
   {
-#if !__CUDA_ARCH__
-    g_state = true;
-#endif
+    NV_IF_TARGET(NV_IS_HOST, (g_state = true;));
   }
 
   value_type *allocate(std::ptrdiff_t n)
@@ -118,12 +123,14 @@ bool my_allocator_with_custom_destroy<T>::g_state = false;
 template <typename T>
 void TestAllocatorCustomDestroy(size_t n)
 {
+  my_allocator_with_custom_destroy<T>::g_state = false;
+
   {
     thrust::cpp::vector<T, my_allocator_with_custom_destroy<T> > vec(n);
   } // destroy everything
 
-  if (0 < n)
-    ASSERT_EQUAL(true, my_allocator_with_custom_destroy<T>::g_state);
+  // state should only be true when there are values to destroy:
+  ASSERT_EQUAL(n > 0, my_allocator_with_custom_destroy<T>::g_state);
 }
 DECLARE_VARIABLE_UNITTEST(TestAllocatorCustomDestroy);
 
@@ -202,7 +209,6 @@ void TestAllocatorTraitsRebind()
 }
 DECLARE_UNITTEST(TestAllocatorTraitsRebind);
 
-#if __cplusplus >= 201103L
 void TestAllocatorTraitsRebindCpp11()
 {
   ASSERT_EQUAL(
@@ -250,5 +256,3 @@ void TestAllocatorTraitsRebindCpp11()
   );
 }
 DECLARE_UNITTEST(TestAllocatorTraitsRebindCpp11);
-#endif
-
diff --git a/testing/allocator_aware_policies.cu b/testing/allocator_aware_policies.cu
index a1b7b911a..0a737c3ce 100644
--- a/testing/allocator_aware_policies.cu
+++ b/testing/allocator_aware_policies.cu
@@ -17,15 +17,16 @@ struct test_allocator_t
 test_allocator_t<int> test_allocator = test_allocator_t<int>();
 const test_allocator_t<int> const_test_allocator = test_allocator_t<int>();
 
-struct test_memory_resource_t THRUST_FINAL : thrust::mr::memory_resource<>
+struct test_memory_resource_t final : thrust::mr::memory_resource<>
 {
-    void * do_allocate(std::size_t, std::size_t) THRUST_OVERRIDE
+    void * do_allocate(std::size_t size, std::size_t) override
     {
-        return NULL;
+        return reinterpret_cast<void *>(size);
     }
 
-    void do_deallocate(void *, std::size_t, std::size_t) THRUST_OVERRIDE
+    void do_deallocate(void * ptr, std::size_t size, std::size_t) override
     {
+        ASSERT_EQUAL(ptr, reinterpret_cast<void *>(size));
     }
 } test_memory_resource;
 
@@ -83,7 +84,8 @@ struct TestAllocatorAttachment
             get_temporary_buffer<int>(
                 policy,
                 123
-            ).first
+            ).first,
+            123
         );
     }
 
@@ -106,8 +108,9 @@ struct TestAllocatorAttachment
         test_temporary_allocation_valid(policy(std::allocator<int>()));
         test_temporary_allocation_valid(policy(alloc));
         test_temporary_allocation_valid(policy(const_alloc));
+        test_temporary_allocation_valid(policy(&test_memory_resource));
 
-        #if THRUST_CPP_DIALECT >= 2011 
+        #if THRUST_CPP_DIALECT >= 2011
         test_temporary_allocation_valid(policy(std::allocator<int>()).after(1));
         test_temporary_allocation_valid(policy(alloc).after(1));
         test_temporary_allocation_valid(policy(const_alloc).after(1));
diff --git a/testing/async/CMakeLists.txt b/testing/async/CMakeLists.txt
new file mode 100644
index 000000000..00d50f097
--- /dev/null
+++ b/testing/async/CMakeLists.txt
@@ -0,0 +1,80 @@
+# The async tests perform a large amount of codegen, making them expensive to
+# build and test. To keep compilation and runtimes manageable, the tests are
+# broken up into many files per algorithm to enable parallelism during
+# compilation and testing. The structure of these test directories are:
+#
+# thrust/testing/async/<algorithm_name>/<unit_test>.cu
+#
+# These generate executables and CTest tests named
+# ${config_prefix}.test.async.<algorithm_name>.<unit_test>.
+
+# The async tests only support CUDA enabled configs. Create a list of valid
+# thrust targets:
+set(cuda_configs)
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  if (config_device STREQUAL CUDA)
+    list(APPEND cuda_configs ${thrust_target})
+  endif()
+endforeach()
+
+list(LENGTH cuda_configs num_cuda_configs)
+if (num_cuda_configs EQUAL 0)
+  return() # No valid configs found, nothing to do.
+endif()
+
+# Process a single algorithm directory, adding all .cu/cpp files as tests for
+# each valid backend. algo_name is the name of the subdir (<algorithm_name>
+# above) and is used for naming the executable/targets.
+function(thrust_add_async_test_dir algo_name)
+  file(GLOB test_srcs
+    RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
+    CONFIGURE_DEPENDS
+    "${algo_name}/*.cu"
+    "${algo_name}/*.cpp"
+  )
+
+  # Per-algorithm, all-config metatarget: thrust.all.test.async.[algo].all
+  set(algo_meta_target thrust.all.test.async.${algo_name}.all)
+  add_custom_target(${algo_meta_target})
+
+  foreach(thrust_target IN LISTS cuda_configs)
+    thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+
+    # Per-algorithm, per-config metatarget: thrust.[config].test.async.[algo].all
+    set(algo_config_meta_target ${config_prefix}.test.async.${algo_name}.all)
+    add_custom_target(${algo_config_meta_target})
+    add_dependencies(${algo_meta_target} ${algo_config_meta_target})
+
+    foreach(test_src IN LISTS test_srcs)
+      get_filename_component(test_name "${test_src}" NAME_WLE)
+      string(PREPEND test_name async.${algo_name}.)
+
+      thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target})
+      if(THRUST_ENABLE_TESTS_WITH_RDC)
+        thrust_enable_rdc_for_cuda_target(${test_target})
+      endif()
+
+      add_dependencies(${algo_config_meta_target} ${test_target})
+    endforeach()
+  endforeach()
+endfunction()
+
+# Grab all algorithm subdirectories:
+set(test_dirs)
+file(GLOB contents
+  CONFIGURE_DEPENDS
+  "${CMAKE_CURRENT_LIST_DIR}/*"
+)
+
+foreach(test_dir IN LISTS contents)
+  if(IS_DIRECTORY "${test_dir}")
+    list(APPEND test_dirs "${test_dir}")
+  endif()
+endforeach()
+
+# Process all test dirs:
+foreach(test_dir IN LISTS test_dirs)
+  get_filename_component(algo_name "${test_dir}" NAME_WLE)
+  thrust_add_async_test_dir(${algo_name})
+endforeach()
diff --git a/testing/async/exclusive_scan/counting_iterator.cu b/testing/async/exclusive_scan/counting_iterator.cu
new file mode 100644
index 000000000..7771299dd
--- /dev/null
+++ b/testing/async/exclusive_scan/counting_iterator.cu
@@ -0,0 +1,46 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+#include <algorithm>
+#include <limits>
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename initial_value_type  = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct invoker
+    : testing::async::mixin::input::counting_iterator_from_0<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::exclusive_scan::mixin::postfix_args::
+        all_overloads<initial_value_type, alternate_binary_op>
+    , testing::async::exclusive_scan::mixin::invoke_reference::
+        host_synchronous<input_value_type, output_value_type>
+    , testing::async::exclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "fancy input iterator (counting_iterator)";
+  }
+};
+
+template <typename T>
+struct test_counting_iterator
+{
+  void operator()(std::size_t num_values) const
+  {
+    num_values = unittest::truncate_to_max_representable<T>(num_values);
+    testing::async::test_policy_overloads<invoker<T>>::run(num_values);
+  }
+};
+// Use built-in types only, counting_iterator doesn't seem to be compatible with
+// the custom_numeric.
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_counting_iterator,
+                                          BuiltinNumericTypes);
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/discard_output.cu b/testing/async/exclusive_scan/discard_output.cu
new file mode 100644
index 000000000..ec7ca5f47
--- /dev/null
+++ b/testing/async/exclusive_scan/discard_output.cu
@@ -0,0 +1,38 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+// Compilation test with discard iterators. No runtime validation is actually
+// performed, other than testing whether the algorithm completes without
+// exception.
+
+template <typename input_value_type,
+          typename initial_value_type  = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct discard_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::discard_iterator
+    , testing::async::exclusive_scan::mixin::postfix_args::
+        all_overloads<initial_value_type, alternate_binary_op>
+    , testing::async::mixin::invoke_reference::noop
+    , testing::async::exclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::noop
+{
+  static std::string description() { return "discard output"; }
+};
+
+template <typename T>
+struct test_discard
+{
+  void operator()(std::size_t num_values) const
+  {
+    testing::async::test_policy_overloads<discard_invoker<T>>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_discard, NumericTypes);
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/large_indices.cu b/testing/async/exclusive_scan/large_indices.cu
new file mode 100644
index 000000000..4d1c51df0
--- /dev/null
+++ b/testing/async/exclusive_scan/large_indices.cu
@@ -0,0 +1,244 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+#include <thrust/device_free.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_ptr.h>
+#include <thrust/optional.h>
+
+#include <thrust/iterator/detail/device_system_tag.h>
+#include <thrust/iterator/detail/iterator_facade_category.h>
+
+#include <cstdint>
+
+// This test is an adaptation of TestInclusiveScanWithBigIndices from scan.cu.
+
+namespace
+{
+
+// Fake iterator that asserts
+// (a) it is written with a sequence and
+// (b) a defined maximum value is written at some point
+//
+// This allows us to test very large problem sizes without actually allocating
+// large amounts of memory that would exceed most devices' capacity.
+struct assert_sequence_iterator
+{
+  using value_type      = std::int64_t;
+  using difference_type = std::int64_t;
+
+  // Defined for thrust::iterator_traits:
+  using pointer           = value_type*;
+  using reference         = assert_sequence_iterator; // weird but convenient
+  using iterator_category =
+    typename thrust::detail::iterator_facade_category<
+      thrust::device_system_tag,
+      thrust::random_access_traversal_tag,
+      value_type,
+      reference>::type;
+
+  std::int64_t expected{0};
+  std::int64_t max{0};
+  mutable thrust::device_ptr<bool> found_max{nullptr};
+  mutable thrust::device_ptr<bool> unexpected_value{nullptr};
+
+  // Should be called on the first iterator generated. This needs to be
+  // done explicitly from the host.
+  void initialize_shared_state()
+  {
+    found_max        = thrust::device_malloc<bool>(1);
+    unexpected_value = thrust::device_malloc<bool>(1);
+    *found_max        = false;
+    *unexpected_value = false;
+  }
+
+  // Should be called only once on the initialized iterator. This needs to be
+  // done explicitly from the host.
+  void free_shared_state() const
+  {
+    thrust::device_free(found_max);
+    thrust::device_free(unexpected_value);
+    found_max        = nullptr;
+    unexpected_value = nullptr;
+  }
+
+  __host__ __device__ assert_sequence_iterator operator+(difference_type i) const
+  {
+    return clone(expected + i);
+  }
+
+  __host__ __device__ reference operator[](difference_type i) const
+  {
+    return clone(expected + i);
+  }
+
+  // Some weirdness, this iterator acts like its own reference
+  __device__ assert_sequence_iterator operator=(value_type val)
+  {
+    if (val != expected)
+    {
+      printf("Error: expected %lld, got %lld\n", expected, val);
+      *unexpected_value = true;
+    }
+    else if (val == max)
+    {
+      *found_max = true;
+    }
+
+    return *this;
+  }
+
+private:
+  __host__ __device__
+  assert_sequence_iterator clone(value_type new_expected) const
+  {
+    return {new_expected, max, found_max, unexpected_value};
+  }
+};
+
+// output mixin that generates assert_sequence_iterators.
+// Must be paired with validate_assert_sequence_iterators mixin to free
+// shared state.
+struct assert_sequence_output
+{
+  struct output_type
+  {
+    using iterator = assert_sequence_iterator;
+
+    iterator iter;
+
+    explicit output_type(iterator&& it)
+        : iter{std::move(it)}
+    {
+      iter.initialize_shared_state();
+    }
+
+    ~output_type()
+    {
+      iter.free_shared_state();
+    }
+
+    iterator begin() { return iter; }
+  };
+
+  template <typename InputType>
+  static output_type generate_output(std::size_t num_values, InputType&)
+  {
+    using value_type = typename assert_sequence_iterator::value_type;
+    assert_sequence_iterator it{0,
+                                // minus one bc exclusive scan:
+                                static_cast<value_type>(num_values - 1),
+                                nullptr,
+                                nullptr};
+    return output_type{std::move(it)};
+  }
+};
+
+struct validate_assert_sequence_iterators
+{
+  using output_t = assert_sequence_output::output_type;
+
+  template <typename EventType>
+  static void compare_outputs(EventType& e,
+                              output_t const&,
+                              output_t const& test)
+  {
+    testing::async::mixin::compare_outputs::detail::basic_event_validation(e);
+
+    ASSERT_EQUAL(*test.iter.unexpected_value, false);
+    ASSERT_EQUAL(*test.iter.found_max, true);
+  }
+};
+
+//------------------------------------------------------------------------------
+// Overloads without custom binary operators use thrust::plus<>, so use
+// constant input iterator to generate the output sequence:
+struct default_bin_op_overloads
+{
+  using postfix_args_type = std::tuple< // List any extra arg overloads:
+    std::tuple<>,                       // - no extra args
+    std::tuple<uint64_t>                // - initial_value
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return postfix_args_type{std::tuple<>{}, std::tuple<uint64_t>{0}};
+  }
+};
+
+struct default_bin_op_invoker
+    : testing::async::mixin::input::constant_iterator_1<std::int64_t>
+    , assert_sequence_output
+    , default_bin_op_overloads
+    , testing::async::mixin::invoke_reference::noop
+    , testing::async::exclusive_scan::mixin::invoke_async::simple
+    , validate_assert_sequence_iterators
+{
+  static std::string description()
+  {
+    return "test large array indices with default binary operator";
+  }
+};
+
+} // anon namespace
+
+void test_large_indices_default_scan_op()
+{
+  // Test problem sizes around signed/unsigned int max:
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 30);
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 31);
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 32);
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 33);
+}
+DECLARE_UNITTEST(test_large_indices_default_scan_op);
+
+namespace
+{
+
+//------------------------------------------------------------------------------
+// Generate the output sequence using counting iterators and thrust::max<> for
+// custom operator overloads.
+struct custom_bin_op_overloads
+{
+  using postfix_args_type = std::tuple<     // List any extra arg overloads:
+    std::tuple<uint64_t, thrust::maximum<>> // - initial_value, binop
+  >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return postfix_args_type{std::make_tuple(0, thrust::maximum<>{})};
+  }
+};
+
+struct custom_bin_op_invoker
+  : testing::async::mixin::input::counting_iterator_from_1<std::int64_t>
+    , assert_sequence_output
+    , custom_bin_op_overloads
+    , testing::async::mixin::invoke_reference::noop
+    , testing::async::exclusive_scan::mixin::invoke_async::simple
+    , validate_assert_sequence_iterators
+{
+  static std::string description()
+  {
+    return "test large array indices with custom binary operator";
+  }
+};
+
+} // namespace
+
+void test_large_indices_custom_scan_op()
+{
+  // Test problem sizes around signed/unsigned int max:
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 30);
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 31);
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 32);
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 33);
+}
+DECLARE_UNITTEST(test_large_indices_custom_scan_op);
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/large_types.cu b/testing/async/exclusive_scan/large_types.cu
new file mode 100644
index 000000000..571d39262
--- /dev/null
+++ b/testing/async/exclusive_scan/large_types.cu
@@ -0,0 +1,58 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+#include <unittest/special_types.h>
+
+// This test is an adaptation of TestScanWithLargeTypes from scan.cu.
+
+// Need special initialization for the FixedVector type:
+template <typename value_type>
+struct device_vector_fill
+{
+  using input_type = thrust::device_vector<value_type>;
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    input_type input(num_values);
+    thrust::fill(input.begin(), input.end(), value_type{2});
+    return input;
+  }
+};
+
+template <typename value_type, typename alternate_binary_op = thrust::maximum<>>
+struct invoker
+    : device_vector_fill<value_type>
+    , testing::async::mixin::output::device_vector<value_type>
+    , testing::async::exclusive_scan::mixin::postfix_args::
+        all_overloads<value_type, alternate_binary_op>
+    , testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous<
+        value_type>
+    , testing::async::exclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "scan with large value types.";
+  }
+};
+
+struct test_large_types
+{
+  void operator()(std::size_t num_values) const
+  {
+    using testing::async::test_policy_overloads;
+
+    test_policy_overloads<invoker<FixedVector<int, 1>>>::run(num_values);
+    test_policy_overloads<invoker<FixedVector<int, 8>>>::run(num_values);
+    test_policy_overloads<invoker<FixedVector<int, 32>>>::run(num_values);
+    test_policy_overloads<invoker<FixedVector<int, 64>>>::run(num_values);
+  }
+};
+DECLARE_UNITTEST(test_large_types);
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/mixed_types.cu b/testing/async/exclusive_scan/mixed_types.cu
new file mode 100644
index 000000000..f69af1794
--- /dev/null
+++ b/testing/async/exclusive_scan/mixed_types.cu
@@ -0,0 +1,120 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+// Test using mixed int/float types for:
+// - input_value_type       | (int, float)
+// - output_value_type      | (int, float)
+// - initial_value_type     | (int, float, <none>)
+// - thrust::plus<T> T-type | (int, float, void>)
+//
+// The initial_value_type and thrust::plus<T> types are covered by the
+// mixin::postfix_args::scan_mixed_types_overloads component.
+//
+// The testing/scan.cu TestMixedTypes test spells out the expected behavior,
+// which is defined by https://wg21.link/P0571.
+
+namespace
+{
+
+template <typename value_type>
+struct mixed_type_input_generator
+{
+  using input_type = thrust::device_vector<value_type>;
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    input_type input(num_values);
+    thrust::sequence(input.begin(),
+                     input.end(),
+                     // fractional values are chosen deliberately to test
+                     // casting orders and accumulator types:
+                     static_cast<value_type>(1.5),
+                     static_cast<value_type>(1));
+    return input;
+  }
+};
+
+// A fractional value is used to ensure that a different result is obtained when
+// using float vs. int.
+template <typename value_type>
+struct mixed_types_postfix_args
+{
+  using postfix_args_type = std::tuple<         // Overloads to test:
+    std::tuple<>,                               // - no extra args
+    std::tuple<value_type>,                     // - initial_value
+    std::tuple<value_type, thrust::plus<>>,     // - initial_value, plus<>
+    std::tuple<value_type, thrust::plus<int>>,  // - initial_value, plus<int>
+    std::tuple<value_type, thrust::plus<float>> // - initial_value, plus<float>
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return postfix_args_type{
+      std::tuple<>{},
+      std::make_tuple(static_cast<value_type>(5.5)),
+      std::make_tuple(static_cast<value_type>(5.5), thrust::plus<>{}),
+      std::make_tuple(static_cast<value_type>(5.5), thrust::plus<int>{}),
+      std::make_tuple(static_cast<value_type>(5.5), thrust::plus<float>{})};
+  }
+};
+
+template <typename input_value_type,
+          typename output_value_type,
+          typename initial_value_type>
+struct invoker
+    : mixed_type_input_generator<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , mixed_types_postfix_args<initial_value_type>
+    , testing::async::exclusive_scan::mixin::invoke_reference::
+        host_synchronous<input_value_type, output_value_type>
+    , testing::async::exclusive_scan::mixin::invoke_async::simple
+    // Use almost_equal instead of almost_equal_if_fp because floating point
+    // addition may be hidden in the scan_op (thrust::plus<float> is always
+    // tested).
+    , testing::async::mixin::compare_outputs::assert_almost_equal
+{
+  static std::string description()
+  {
+    return "mixed input/output/initial type tests";
+  }
+};
+
+} // namespace
+
+void test_scan_mixed_types(size_t num_values)
+{
+  // Since fp addition is non-associative, the results may be slightly off
+  // from the reference.
+  // This is primarily handled by using `compare_almost_equal` to do a fuzzy
+  // comparison. But for large enough test sizes, eventually the scan results
+  // will wrap for integral value_types. If a float accumulator is used, the
+  // small errors from non-associative addition may cause the wrap to happen in
+  // a different location, resulting in an error too large for almost_equal to
+  // ignore.
+  // This wrap seems to happen around 2^16 values, so skip when num_values is
+  // close to that.
+  if (num_values > ((1ll << 16) - 10))
+  {
+    return;
+  }
+
+  // invoker template params are input_value_type, output_vt, initial_vt:
+  using testing::async::test_policy_overloads;
+  test_policy_overloads<invoker<int, int, int>>::run(num_values);
+  test_policy_overloads<invoker<int, int, float>>::run(num_values);
+  test_policy_overloads<invoker<int, float, int>>::run(num_values);
+  test_policy_overloads<invoker<int, float, float>>::run(num_values);
+  test_policy_overloads<invoker<float, int, int>>::run(num_values);
+  test_policy_overloads<invoker<float, int, float>>::run(num_values);
+  test_policy_overloads<invoker<float, float, int>>::run(num_values);
+  // We all float down here
+  test_policy_overloads<invoker<float, float, float>>::run(num_values);
+}
+DECLARE_SIZED_UNITTEST(test_scan_mixed_types);
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/mixin.h b/testing/async/exclusive_scan/mixin.h
new file mode 100644
index 000000000..02ac9908f
--- /dev/null
+++ b/testing/async/exclusive_scan/mixin.h
@@ -0,0 +1,119 @@
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/async/scan.h>
+
+#include <thrust/scan.h>
+
+#include <async/mixin.h>
+
+namespace testing
+{
+namespace async
+{
+namespace exclusive_scan
+{
+
+namespace mixin
+{
+
+//------------------------------------------------------------------------------
+namespace postfix_args
+{
+
+template <typename value_type, typename alternate_binary_op = thrust::maximum<>>
+struct all_overloads
+{
+  using postfix_args_type = std::tuple<         // List any extra arg overloads:
+    std::tuple<>,                               // - no extra args
+    std::tuple<value_type>,                     // - initial_value
+    std::tuple<value_type, alternate_binary_op> // - initial_value, binary_op
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return postfix_args_type{std::tuple<>{},
+                             std::make_tuple(value_type{42}),
+                             std::make_tuple(value_type{42},
+                                             alternate_binary_op{})};
+  }
+};
+
+} // namespace postfix_args
+
+//------------------------------------------------------------------------------
+namespace invoke_reference
+{
+
+template <typename input_value_type,
+          typename output_value_type = input_value_type>
+struct host_synchronous
+{
+  template <typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static void invoke_reference(InputType const& input,
+                               OutputType& output,
+                               PostfixArgTuple&& postfix_tuple,
+                               std::index_sequence<PostfixArgIndices...>)
+  {
+    // Create host versions of the input/output:
+    thrust::host_vector<input_value_type> host_input(input.cbegin(),
+                                                     input.cend());
+    thrust::host_vector<output_value_type> host_output(host_input.size());
+
+    // Run host synchronous algorithm to generate reference.
+    thrust::exclusive_scan(host_input.cbegin(),
+                           host_input.cend(),
+                           host_output.begin(),
+                           std::get<PostfixArgIndices>(
+                             THRUST_FWD(postfix_tuple))...);
+
+    // Copy back to device.
+    output = host_output;
+  }
+};
+
+} // namespace invoke_reference
+
+//------------------------------------------------------------------------------
+namespace invoke_async
+{
+
+struct simple
+{
+  template <typename PrefixArgTuple,
+            std::size_t... PrefixArgIndices,
+            typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static auto invoke_async(PrefixArgTuple&& prefix_tuple,
+                           std::index_sequence<PrefixArgIndices...>,
+                           InputType const& input,
+                           OutputType& output,
+                           PostfixArgTuple&& postfix_tuple,
+                           std::index_sequence<PostfixArgIndices...>)
+  {
+    auto e = thrust::async::exclusive_scan(
+      std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+      input.cbegin(),
+      input.cend(),
+      output.begin(),
+      std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    return e;
+  }
+};
+
+} // namespace invoke_async
+
+} // namespace mixin
+} // namespace exclusive_scan
+} // namespace async
+} // namespace testing
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/simple.cu b/testing/async/exclusive_scan/simple.cu
new file mode 100644
index 000000000..8c55052d7
--- /dev/null
+++ b/testing/async/exclusive_scan/simple.cu
@@ -0,0 +1,72 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename initial_value_type  = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct simple_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::exclusive_scan::mixin::postfix_args::
+        all_overloads<initial_value_type, alternate_binary_op>
+    , testing::async::exclusive_scan::mixin::invoke_reference::
+        host_synchronous<input_value_type, output_value_type>
+    , testing::async::exclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "simple invocation with device vectors";
+  }
+};
+
+template <typename T>
+struct test_simple
+{
+  void operator()(std::size_t num_values) const
+  {
+    testing::async::test_policy_overloads<simple_invoker<T>>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_simple, NumericTypes);
+
+// Testing the in-place algorithm uses the exact same instantiations of the
+// underlying scan implementation as above. Test them here to avoid compiling
+// them twice.
+template <typename input_value_type,
+          typename initial_value_type  = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct simple_inplace_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector_reuse_input<input_value_type>
+    , testing::async::exclusive_scan::mixin::postfix_args::
+        all_overloads<initial_value_type, alternate_binary_op>
+    , testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous<
+        input_value_type>
+    , testing::async::exclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "simple in-place invocation with device vectors";
+  }
+};
+
+template <typename T>
+struct test_simple_in_place
+{
+  void operator()(std::size_t num_values) const
+  {
+    using invoker = simple_inplace_invoker<T>;
+    testing::async::test_policy_overloads<invoker>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_simple_in_place, NumericTypes);
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/stateful_operator.cu b/testing/async/exclusive_scan/stateful_operator.cu
new file mode 100644
index 000000000..411ffbd99
--- /dev/null
+++ b/testing/async/exclusive_scan/stateful_operator.cu
@@ -0,0 +1,62 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+namespace
+{
+
+// Custom binary operator for scan:
+template <typename T>
+struct stateful_operator
+{
+  T offset;
+
+  __host__ __device__ T operator()(T v1, T v2) { return v1 + v2 + offset; }
+};
+
+// Postfix args overload definition that uses a stateful custom binary operator
+template <typename value_type>
+struct use_stateful_operator
+{
+  using postfix_args_type = std::tuple<                   // Single overload:
+    std::tuple<value_type, stateful_operator<value_type>> // init_val, bin_op
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return postfix_args_type{
+      std::make_tuple(value_type{42},
+                      stateful_operator<value_type>{value_type{2}})};
+  }
+};
+
+template <typename value_type>
+struct invoker
+    : testing::async::mixin::input::device_vector<value_type>
+    , testing::async::mixin::output::device_vector<value_type>
+    , use_stateful_operator<value_type>
+    , testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous<
+        value_type>
+    , testing::async::exclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description() { return "scan with stateful operator"; }
+};
+
+} // namespace
+
+template <typename T>
+struct test_stateful_operator
+{
+  void operator()(std::size_t num_values) const
+  {
+    testing::async::test_policy_overloads<invoker<T>>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_stateful_operator, NumericTypes);
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/using_vs_adl.cu b/testing/async/exclusive_scan/using_vs_adl.cu
new file mode 100644
index 000000000..34a80bd79
--- /dev/null
+++ b/testing/async/exclusive_scan/using_vs_adl.cu
@@ -0,0 +1,171 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+// Verify what happens when calling the algorithm without any namespace
+// qualifiers:
+// - If the async entry point is available in the global namespace due to a
+//   using statement, the async algorithm should be called.
+// - Otherwise, ADL should resolve the call to the synchronous algo in the
+//   thrust:: namespace.
+
+namespace invoke_reference
+{
+
+template <typename input_value_type,
+          typename output_value_type = input_value_type>
+struct adl_host_synchronous
+{
+  template <typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static void invoke_reference(InputType const& input,
+                               OutputType& output,
+                               PostfixArgTuple&& postfix_tuple,
+                               std::index_sequence<PostfixArgIndices...>)
+  {
+    // Create host versions of the input/output:
+    thrust::host_vector<input_value_type> host_input(input.cbegin(),
+                                                     input.cend());
+    thrust::host_vector<output_value_type> host_output(host_input.size());
+
+    using OutIter = thrust::remove_cvref_t<decltype(host_output.begin())>;
+
+    // ADL should resolve this to the synchronous `thrust::` algorithm.
+    // This is checked by ensuring that the call returns an output iterator.
+    OutIter result =
+      exclusive_scan(host_input.cbegin(),
+                     host_input.cend(),
+                     host_output.begin(),
+                     std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    (void)result;
+
+    // Copy back to device.
+    output = host_output;
+  }
+};
+
+} // namespace invoke_reference
+
+namespace invoke_async
+{
+
+struct using_namespace
+{
+  template <typename PrefixArgTuple,
+            std::size_t... PrefixArgIndices,
+            typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static auto invoke_async(PrefixArgTuple&& prefix_tuple,
+                           std::index_sequence<PrefixArgIndices...>,
+                           InputType const& input,
+                           OutputType& output,
+                           PostfixArgTuple&& postfix_tuple,
+                           std::index_sequence<PostfixArgIndices...>)
+  {
+    // Importing the CPO into the current namespace should unambiguously resolve
+    // this call to the CPO, as opposed to resolving to the thrust:: algorithm
+    // via ADL. This is verified by checking that an event is returned.
+    using namespace thrust::async;
+    thrust::device_event e =
+      exclusive_scan(std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+                     input.cbegin(),
+                     input.cend(),
+                     output.begin(),
+                     std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    return e;
+  }
+};
+
+struct using_cpo
+{
+  template <typename PrefixArgTuple,
+            std::size_t... PrefixArgIndices,
+            typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static auto invoke_async(PrefixArgTuple&& prefix_tuple,
+                           std::index_sequence<PrefixArgIndices...>,
+                           InputType const& input,
+                           OutputType& output,
+                           PostfixArgTuple&& postfix_tuple,
+                           std::index_sequence<PostfixArgIndices...>)
+  {
+    // Importing the CPO into the current namespace should unambiguously resolve
+    // this call to the CPO, as opposed to resolving to the thrust:: algorithm
+    // via ADL. This is verified by checking that an event is returned.
+    using thrust::async::exclusive_scan;
+    thrust::device_event e =
+      exclusive_scan(std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+                     input.cbegin(),
+                     input.cend(),
+                     output.begin(),
+                     std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    return e;
+  }
+};
+
+} // namespace invoke_async
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename initial_value_type  = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct using_namespace_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::exclusive_scan::mixin::postfix_args::
+        all_overloads<initial_value_type, alternate_binary_op>
+    , invoke_reference::adl_host_synchronous<input_value_type, output_value_type>
+    , invoke_async::using_namespace
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "importing async CPO with `using namespace thrust::async`";
+  }
+};
+
+void test_using_namespace()
+{
+  using invoker = using_namespace_invoker<int>;
+  testing::async::test_policy_overloads<invoker>::run(128);
+}
+DECLARE_UNITTEST(test_using_namespace);
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename initial_value_type  = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct using_cpo_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::exclusive_scan::mixin::postfix_args::
+        all_overloads<initial_value_type, alternate_binary_op>
+    , invoke_reference::adl_host_synchronous<input_value_type, output_value_type>
+    , invoke_async::using_cpo
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "importing async CPO with "
+           "`using namespace thrust::async::exclusive_scan`";
+  }
+};
+
+void test_using_cpo()
+{
+  using invoker = using_cpo_invoker<int>;
+  testing::async::test_policy_overloads<invoker>::run(128);
+}
+DECLARE_UNITTEST(test_using_cpo);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/counting_iterator.cu b/testing/async/inclusive_scan/counting_iterator.cu
new file mode 100644
index 000000000..fe9fdeb80
--- /dev/null
+++ b/testing/async/inclusive_scan/counting_iterator.cu
@@ -0,0 +1,45 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+#include <algorithm>
+#include <limits>
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct invoker
+    : testing::async::mixin::input::counting_iterator_from_0<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::inclusive_scan::mixin::postfix_args::
+        all_overloads<alternate_binary_op>
+    , testing::async::inclusive_scan::mixin::invoke_reference::
+        host_synchronous<input_value_type, output_value_type>
+    , testing::async::inclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "fancy input iterator (counting_iterator)";
+  }
+};
+
+template <typename T>
+struct test_counting_iterator
+{
+  void operator()(std::size_t num_values) const
+  {
+    num_values = unittest::truncate_to_max_representable<T>(num_values);
+    testing::async::test_policy_overloads<invoker<T>>::run(num_values);
+  }
+};
+// Use built-in types only, counting_iterator doesn't seem to be compatible with
+// the custom_numeric.
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_counting_iterator,
+                                          BuiltinNumericTypes);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/discard_output.cu b/testing/async/inclusive_scan/discard_output.cu
new file mode 100644
index 000000000..c202de7f0
--- /dev/null
+++ b/testing/async/inclusive_scan/discard_output.cu
@@ -0,0 +1,37 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+// Compilation test with discard iterators. No runtime validation is actually
+// performed, other than testing whether the algorithm completes without
+// exception.
+
+template <typename input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct discard_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::discard_iterator
+    , testing::async::inclusive_scan::mixin::postfix_args::
+        all_overloads<alternate_binary_op>
+    , testing::async::mixin::invoke_reference::noop
+    , testing::async::inclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::noop
+{
+  static std::string description() { return "discard output"; }
+};
+
+template <typename T>
+struct test_discard
+{
+  void operator()(std::size_t num_values) const
+  {
+    testing::async::test_policy_overloads<discard_invoker<T>>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_discard, NumericTypes);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/large_indices.cu b/testing/async/inclusive_scan/large_indices.cu
new file mode 100644
index 000000000..4124cf96d
--- /dev/null
+++ b/testing/async/inclusive_scan/large_indices.cu
@@ -0,0 +1,239 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+#include <thrust/device_free.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_ptr.h>
+#include <thrust/optional.h>
+
+#include <thrust/iterator/detail/device_system_tag.h>
+#include <thrust/iterator/detail/iterator_facade_category.h>
+
+#include <cstdint>
+
+// This test is an adaptation of TestInclusiveScanWithBigIndices from scan.cu.
+
+namespace
+{
+
+// Fake iterator that asserts
+// (a) it is written with a sequence and
+// (b) a defined maximum value is written at some point
+//
+// This allows us to test very large problem sizes without actually allocating
+// large amounts of memory that would exceed most devices' capacity.
+struct assert_sequence_iterator
+{
+  using value_type      = std::int64_t;
+  using difference_type = std::int64_t;
+
+  // Defined for thrust::iterator_traits:
+  using pointer           = value_type *;
+  using reference         = assert_sequence_iterator; // weird but convenient
+  using iterator_category = typename thrust::detail::iterator_facade_category<
+    thrust::device_system_tag,
+    thrust::random_access_traversal_tag,
+    value_type,
+    reference>::type;
+
+  std::int64_t expected{0};
+  std::int64_t max{0};
+  mutable thrust::device_ptr<bool> found_max{nullptr};
+  mutable thrust::device_ptr<bool> unexpected_value{nullptr};
+
+  // Should be called on the first iterator generated. This needs to be done
+  // explicitly from the host.
+  void initialize_shared_state()
+  {
+    found_max         = thrust::device_malloc<bool>(1);
+    unexpected_value  = thrust::device_malloc<bool>(1);
+    *found_max        = false;
+    *unexpected_value = false;
+  }
+
+  // Should be called only once on the initialized iterator. This needs to be
+  // done explicitly from the host.
+  void free_shared_state() const
+  {
+    thrust::device_free(found_max);
+    thrust::device_free(unexpected_value);
+    found_max        = nullptr;
+    unexpected_value = nullptr;
+  }
+
+  __host__ __device__ assert_sequence_iterator operator+(difference_type i) const
+  {
+    return clone(expected + i);
+  }
+
+  __host__ __device__ reference operator[](difference_type i) const
+  {
+    return clone(expected + i);
+  }
+
+  // Some weirdness, this iterator acts like its own reference
+  __device__ assert_sequence_iterator operator=(value_type val)
+  {
+    if (val != expected)
+    {
+      printf("Error: expected %lld, got %lld\n", expected, val);
+
+      *unexpected_value = true;
+    }
+    else if (val == max)
+    {
+      *found_max = true;
+    }
+
+    return *this;
+  }
+
+private:
+  __host__ __device__ assert_sequence_iterator
+  clone(value_type new_expected) const
+  {
+    return {new_expected, max, found_max, unexpected_value};
+  }
+};
+
+// output mixin that generates assert_sequence_iterators.
+// Must be paired with validate_assert_sequence_iterators mixin to free
+// shared state.
+struct assert_sequence_output
+{
+  struct output_type
+  {
+    using iterator = assert_sequence_iterator;
+
+    iterator iter;
+
+    explicit output_type(iterator &&it)
+        : iter{std::move(it)}
+    {
+      iter.initialize_shared_state();
+    }
+
+    ~output_type() { iter.free_shared_state(); }
+
+    iterator begin() { return iter; }
+  };
+
+  template <typename InputType>
+  static output_type generate_output(std::size_t num_values, InputType &)
+  {
+    using value_type = typename assert_sequence_iterator::value_type;
+    assert_sequence_iterator it{1,
+                                static_cast<value_type>(num_values),
+                                nullptr,
+                                nullptr};
+    return output_type{std::move(it)};
+  }
+};
+
+struct validate_assert_sequence_iterators
+{
+  using output_t = assert_sequence_output::output_type;
+
+  template <typename EventType>
+  static void compare_outputs(EventType &e,
+                              output_t const &,
+                              output_t const &test)
+  {
+    testing::async::mixin::compare_outputs::detail::basic_event_validation(e);
+
+    ASSERT_EQUAL(*test.iter.unexpected_value, false);
+    ASSERT_EQUAL(*test.iter.found_max, true);
+  }
+};
+
+//------------------------------------------------------------------------------
+// Overloads without custom binary operators use thrust::plus<>, so use
+// constant input iterator to generate the output sequence:
+struct default_bin_op_overloads
+{
+  using postfix_args_type = std::tuple< // List any extra arg overloads:
+    std::tuple<>                        // - no extra args
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return std::tuple<std::tuple<>>{};
+  }
+};
+
+struct default_bin_op_invoker
+    : testing::async::mixin::input::constant_iterator_1<std::int64_t>
+    , assert_sequence_output
+    , default_bin_op_overloads
+    , testing::async::mixin::invoke_reference::noop
+    , testing::async::inclusive_scan::mixin::invoke_async::simple
+    , validate_assert_sequence_iterators
+{
+  static std::string description()
+  {
+    return "test large array indices with default binary operator";
+  }
+};
+
+} // end anon namespace
+
+void test_large_indices_default_scan_op()
+{
+  // Test problem sizes around signed/unsigned int max:
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 30);
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 31);
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 32);
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 33);
+}
+DECLARE_UNITTEST(test_large_indices_default_scan_op);
+
+namespace
+{
+
+//------------------------------------------------------------------------------
+// Generate the output sequence using counting iterators and thrust::max<> for
+// custom operator overloads.
+struct custom_bin_op_overloads
+{
+  using postfix_args_type = std::tuple< // List any extra arg overloads:
+    std::tuple<thrust::maximum<>>       // - custom binary op
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return postfix_args_type{std::make_tuple(thrust::maximum<>{})};
+  }
+};
+
+struct custom_bin_op_invoker
+    : testing::async::mixin::input::counting_iterator_from_1<std::int64_t>
+    , assert_sequence_output
+    , custom_bin_op_overloads
+    , testing::async::mixin::invoke_reference::noop
+    , testing::async::inclusive_scan::mixin::invoke_async::simple
+    , validate_assert_sequence_iterators
+{
+  static std::string description()
+  {
+    return "test large array indices with custom binary operator";
+  }
+};
+
+} // end anon namespace
+
+void test_large_indices_custom_scan_op()
+{
+  // Test problem sizes around signed/unsigned int max:
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 30);
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 31);
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 32);
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 33);
+}
+DECLARE_UNITTEST(test_large_indices_custom_scan_op);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/large_types.cu b/testing/async/inclusive_scan/large_types.cu
new file mode 100644
index 000000000..00bb8b461
--- /dev/null
+++ b/testing/async/inclusive_scan/large_types.cu
@@ -0,0 +1,58 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+#include <unittest/special_types.h>
+
+// This test is an adaptation of TestScanWithLargeTypes from scan.cu.
+
+// Need special initialization for the FixedVector type:
+template <typename value_type>
+struct device_vector_fill
+{
+  using input_type = thrust::device_vector<value_type>;
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    input_type input(num_values);
+    thrust::fill(input.begin(), input.end(), value_type{2});
+    return input;
+  }
+};
+
+template <typename value_type, typename alternate_binary_op = thrust::maximum<>>
+struct invoker
+    : device_vector_fill<value_type>
+    , testing::async::mixin::output::device_vector<value_type>
+    , testing::async::inclusive_scan::mixin::postfix_args::
+        all_overloads<alternate_binary_op>
+    , testing::async::inclusive_scan::mixin::invoke_reference::host_synchronous<
+        value_type>
+    , testing::async::inclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "scan with large value types.";
+  }
+};
+
+struct test_large_types
+{
+  void operator()(std::size_t num_values) const
+  {
+    using testing::async::test_policy_overloads;
+
+    test_policy_overloads<invoker<FixedVector<int, 1>>>::run(num_values);
+    test_policy_overloads<invoker<FixedVector<int, 8>>>::run(num_values);
+    test_policy_overloads<invoker<FixedVector<int, 32>>>::run(num_values);
+    test_policy_overloads<invoker<FixedVector<int, 64>>>::run(num_values);
+  }
+};
+DECLARE_UNITTEST(test_large_types);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/mixed_types.cu b/testing/async/inclusive_scan/mixed_types.cu
new file mode 100644
index 000000000..57931c8d0
--- /dev/null
+++ b/testing/async/inclusive_scan/mixed_types.cu
@@ -0,0 +1,109 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+// Test using mixed int/float types for:
+// - input_value_type       | (int, float)
+// - output_value_type      | (int, float)
+// - thrust::plus<T> T-type | (int, float, void>)
+//
+// The thrust::plus<T> types are covered by the
+// scan_mixed_types_overloads component.
+//
+// The testing/scan.cu TestMixedTypes test spells out the expected behavior,
+// which is defined by https://wg21.link/P0571.
+
+namespace
+{
+
+template <typename value_type>
+struct mixed_type_input_generator
+{
+  using input_type = thrust::device_vector<value_type>;
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    input_type input(num_values);
+    thrust::sequence(input.begin(),
+                     input.end(),
+                     // fractional values are chosen deliberately to test
+                     // casting orders and accumulator types:
+                     static_cast<value_type>(1.5),
+                     static_cast<value_type>(1));
+    return input;
+  }
+};
+
+// A fractional value is used to ensure that a different result is obtained when
+// using float vs. int.
+struct mixed_types_postfix_args
+{
+  using postfix_args_type = std::tuple<  // Overloads to test:
+    std::tuple<>,                        // - no extra args
+    std::tuple<thrust::plus<>>,          // - plus<>
+    std::tuple<thrust::plus<int>>,       // - plus<int>
+    std::tuple<thrust::plus<float>>      // - plus<float>
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return postfix_args_type{std::tuple<>{},
+                             std::make_tuple(thrust::plus<>{}),
+                             std::make_tuple(thrust::plus<int>{}),
+                             std::make_tuple(thrust::plus<float>{})};
+  }
+};
+
+template <typename input_value_type,
+          typename output_value_type>
+struct invoker
+    : mixed_type_input_generator<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , mixed_types_postfix_args
+    , testing::async::inclusive_scan::mixin::invoke_reference::
+        host_synchronous<input_value_type, output_value_type>
+    , testing::async::inclusive_scan::mixin::invoke_async::simple
+    // Use almost_equal instead of almost_equal_if_fp because floating point
+    // addition may be hidden in the scan_op (thrust::plus<float> is always
+    // tested).
+    , testing::async::mixin::compare_outputs::assert_almost_equal
+{
+  static std::string description()
+  {
+    return "mixed input/output/functor value_type tests";
+  }
+};
+
+} // namespace
+
+void test_scan_mixed_types(size_t num_values)
+{
+  // Since fp addition is non-associative, the results may be slightly off
+  // from the reference.
+  // This is primarily handled by using `compare_almost_equal` to do a fuzzy
+  // comparison. But for large enough test sizes, eventually the scan results
+  // will wrap for integral value_types. If a float accumulator is used, the
+  // small errors from non-associative addition may cause the wrap to happen in
+  // a different location, resulting in an error too large for almost_equal to
+  // ignore.
+  // This wrap seems to happen around 2^16 values, so skip when num_values is
+  // close to that.
+  if (num_values > ((1ll << 16) - 10))
+  {
+    return;
+  }
+
+  // invoker template params are input_value_type, output_vt:
+  using testing::async::test_policy_overloads;
+  test_policy_overloads<invoker<int, int>>::run(num_values);
+  test_policy_overloads<invoker<int, float>>::run(num_values);
+  test_policy_overloads<invoker<float, int>>::run(num_values);
+  test_policy_overloads<invoker<float, float>>::run(num_values);
+}
+DECLARE_SIZED_UNITTEST(test_scan_mixed_types);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/mixin.h b/testing/async/inclusive_scan/mixin.h
new file mode 100644
index 000000000..82ecd59b8
--- /dev/null
+++ b/testing/async/inclusive_scan/mixin.h
@@ -0,0 +1,115 @@
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/async/scan.h>
+
+#include <thrust/scan.h>
+
+#include <async/mixin.h>
+
+namespace testing
+{
+namespace async
+{
+namespace inclusive_scan
+{
+
+namespace mixin
+{
+
+//------------------------------------------------------------------------------
+namespace postfix_args
+{
+
+template <typename alternate_binary_op = thrust::maximum<>>
+struct all_overloads
+{
+  using postfix_args_type = std::tuple< // List any extra arg overloads:
+    std::tuple<>,                       // - no extra args
+    std::tuple<alternate_binary_op>     // - binary_op
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return postfix_args_type{std::tuple<>{}, std::make_tuple(alternate_binary_op{})};
+  }
+};
+
+} // namespace postfix_args
+
+//------------------------------------------------------------------------------
+namespace invoke_reference
+{
+
+template <typename input_value_type,
+          typename output_value_type = input_value_type>
+struct host_synchronous
+{
+  template <typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static void invoke_reference(InputType const& input,
+                               OutputType& output,
+                               PostfixArgTuple&& postfix_tuple,
+                               std::index_sequence<PostfixArgIndices...>)
+  {
+    // Create host versions of the input/output:
+    thrust::host_vector<input_value_type> host_input(input.cbegin(),
+                                                     input.cend());
+    thrust::host_vector<output_value_type> host_output(host_input.size());
+
+    // Run host synchronous algorithm to generate reference.
+    thrust::inclusive_scan(host_input.cbegin(),
+                           host_input.cend(),
+                           host_output.begin(),
+                           std::get<PostfixArgIndices>(
+                             THRUST_FWD(postfix_tuple))...);
+
+    // Copy back to device.
+    output = host_output;
+  }
+};
+
+} // namespace invoke_reference
+
+//------------------------------------------------------------------------------
+namespace invoke_async
+{
+
+struct simple
+{
+  template <typename PrefixArgTuple,
+            std::size_t... PrefixArgIndices,
+            typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static auto invoke_async(PrefixArgTuple&& prefix_tuple,
+                           std::index_sequence<PrefixArgIndices...>,
+                           InputType const& input,
+                           OutputType& output,
+                           PostfixArgTuple&& postfix_tuple,
+                           std::index_sequence<PostfixArgIndices...>)
+  {
+    auto e = thrust::async::inclusive_scan(
+      std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+      input.cbegin(),
+      input.cend(),
+      output.begin(),
+      std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    return e;
+  }
+};
+
+} // namespace invoke_async
+
+} // namespace mixin
+} // namespace inclusive_scan
+} // namespace async
+} // namespace testing
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/simple.cu b/testing/async/inclusive_scan/simple.cu
new file mode 100644
index 000000000..1256f009b
--- /dev/null
+++ b/testing/async/inclusive_scan/simple.cu
@@ -0,0 +1,70 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct simple_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::inclusive_scan::mixin::postfix_args::
+        all_overloads<alternate_binary_op>
+    , testing::async::inclusive_scan::mixin::invoke_reference::
+        host_synchronous<input_value_type, output_value_type>
+    , testing::async::inclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "simple invocation with device vectors";
+  }
+};
+
+template <typename T>
+struct test_simple
+{
+  void operator()(std::size_t num_values) const
+  {
+    testing::async::test_policy_overloads<simple_invoker<T>>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_simple, NumericTypes);
+
+// Testing the in-place algorithm uses the exact same instantiations of the
+// underlying scan implementation as above. Test them here to avoid compiling
+// them twice.
+template <typename input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct simple_inplace_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector_reuse_input<input_value_type>
+    , testing::async::inclusive_scan::mixin::postfix_args::
+        all_overloads<alternate_binary_op>
+    , testing::async::inclusive_scan::mixin::invoke_reference::host_synchronous<
+        input_value_type>
+    , testing::async::inclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "simple in-place invocation with device vectors";
+  }
+};
+
+template <typename T>
+struct test_simple_in_place
+{
+  void operator()(std::size_t num_values) const
+  {
+    using invoker = simple_inplace_invoker<T>;
+    testing::async::test_policy_overloads<invoker>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_simple_in_place, NumericTypes);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/stateful_operator.cu b/testing/async/inclusive_scan/stateful_operator.cu
new file mode 100644
index 000000000..224c29303
--- /dev/null
+++ b/testing/async/inclusive_scan/stateful_operator.cu
@@ -0,0 +1,61 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+namespace
+{
+
+// Custom binary operator for scan:
+template <typename T>
+struct stateful_operator
+{
+  T offset;
+
+  __host__ __device__ T operator()(T v1, T v2) { return v1 + v2 + offset; }
+};
+
+// Postfix args overload definition that uses a stateful custom binary operator
+template <typename value_type>
+struct use_stateful_operator
+{
+  using postfix_args_type = std::tuple<       // Single overload:
+    std::tuple<stateful_operator<value_type>> // bin_op
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return postfix_args_type{
+      std::make_tuple(stateful_operator<value_type>{value_type{2}})};
+  }
+};
+
+template <typename value_type>
+struct invoker
+    : testing::async::mixin::input::device_vector<value_type>
+    , testing::async::mixin::output::device_vector<value_type>
+    , use_stateful_operator<value_type>
+    , testing::async::inclusive_scan::mixin::invoke_reference::host_synchronous<
+        value_type>
+    , testing::async::inclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description() { return "scan with stateful operator"; }
+};
+
+} // namespace
+
+template <typename T>
+struct test_stateful_operator
+{
+  void operator()(std::size_t num_values) const
+  {
+    testing::async::test_policy_overloads<invoker<T>>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_stateful_operator, NumericTypes);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/using_vs_adl.cu b/testing/async/inclusive_scan/using_vs_adl.cu
new file mode 100644
index 000000000..9789ce5c9
--- /dev/null
+++ b/testing/async/inclusive_scan/using_vs_adl.cu
@@ -0,0 +1,169 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+// Verify what happens when calling the algorithm without any namespace
+// qualifiers:
+// - If the async entry point is available in the global namespace due to a
+//   using statement, the async algorithm should be called.
+// - Otherwise, ADL should resolve the call to the synchronous algo in the
+//   thrust:: namespace.
+
+namespace invoke_reference
+{
+
+template <typename input_value_type,
+          typename output_value_type = input_value_type>
+struct adl_host_synchronous
+{
+  template <typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static void invoke_reference(InputType const& input,
+                               OutputType& output,
+                               PostfixArgTuple&& postfix_tuple,
+                               std::index_sequence<PostfixArgIndices...>)
+  {
+    // Create host versions of the input/output:
+    thrust::host_vector<input_value_type> host_input(input.cbegin(),
+                                                     input.cend());
+    thrust::host_vector<output_value_type> host_output(host_input.size());
+
+    using OutIter = thrust::remove_cvref_t<decltype(host_output.begin())>;
+
+    // ADL should resolve this to the synchronous `thrust::` algorithm.
+    // This is checked by ensuring that the call returns an output iterator.
+    OutIter result =
+      inclusive_scan(host_input.cbegin(),
+                     host_input.cend(),
+                     host_output.begin(),
+                     std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    (void)result;
+
+    // Copy back to device.
+    output = host_output;
+  }
+};
+
+} // namespace invoke_reference
+
+namespace invoke_async
+{
+
+struct using_namespace
+{
+  template <typename PrefixArgTuple,
+            std::size_t... PrefixArgIndices,
+            typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static auto invoke_async(PrefixArgTuple&& prefix_tuple,
+                           std::index_sequence<PrefixArgIndices...>,
+                           InputType const& input,
+                           OutputType& output,
+                           PostfixArgTuple&& postfix_tuple,
+                           std::index_sequence<PostfixArgIndices...>)
+  {
+    // Importing the CPO into the current namespace should unambiguously resolve
+    // this call to the CPO, as opposed to resolving to the thrust:: algorithm
+    // via ADL. This is verified by checking that an event is returned.
+    using namespace thrust::async;
+    thrust::device_event e =
+      inclusive_scan(std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+                     input.cbegin(),
+                     input.cend(),
+                     output.begin(),
+                     std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    return e;
+  }
+};
+
+struct using_cpo
+{
+  template <typename PrefixArgTuple,
+            std::size_t... PrefixArgIndices,
+            typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static auto invoke_async(PrefixArgTuple&& prefix_tuple,
+                           std::index_sequence<PrefixArgIndices...>,
+                           InputType const& input,
+                           OutputType& output,
+                           PostfixArgTuple&& postfix_tuple,
+                           std::index_sequence<PostfixArgIndices...>)
+  {
+    // Importing the CPO into the current namespace should unambiguously resolve
+    // this call to the CPO, as opposed to resolving to the thrust:: algorithm
+    // via ADL. This is verified by checking that an event is returned.
+    using thrust::async::inclusive_scan;
+    thrust::device_event e =
+      inclusive_scan(std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+                     input.cbegin(),
+                     input.cend(),
+                     output.begin(),
+                     std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    return e;
+  }
+};
+
+} // namespace invoke_async
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct using_namespace_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::inclusive_scan::mixin::postfix_args::
+        all_overloads<alternate_binary_op>
+    , invoke_reference::adl_host_synchronous<input_value_type, output_value_type>
+    , invoke_async::using_namespace
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "importing async CPO with `using namespace thrust::async`";
+  }
+};
+
+void test_using_namespace()
+{
+  using invoker = using_namespace_invoker<int>;
+  testing::async::test_policy_overloads<invoker>::run(128);
+}
+DECLARE_UNITTEST(test_using_namespace);
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct using_cpo_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::inclusive_scan::mixin::postfix_args::
+        all_overloads<alternate_binary_op>
+    , invoke_reference::adl_host_synchronous<input_value_type, output_value_type>
+    , invoke_async::using_cpo
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "importing async CPO with "
+           "`using namespace thrust::async::inclusive_scan`";
+  }
+};
+
+void test_using_cpo()
+{
+  using invoker = using_cpo_invoker<int>;
+  testing::async::test_policy_overloads<invoker>::run(128);
+}
+DECLARE_UNITTEST(test_using_cpo);
+
+#endif // C++14
diff --git a/testing/async/mixin.h b/testing/async/mixin.h
new file mode 100644
index 000000000..6d1c06ed7
--- /dev/null
+++ b/testing/async/mixin.h
@@ -0,0 +1,663 @@
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/sequence.h>
+
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/discard_iterator.h>
+
+#include <thrust/type_traits/logical_metafunctions.h>
+
+#include <unittest/unittest.h>
+#include <unittest/util_async.h>
+
+#include <tuple>
+#include <type_traits>
+
+// clang-format off
+
+// This file contains a set of mix-in classes that define an algorithm
+// definition for use with test_policy_overloads<algo_def>. The algorithm
+// definition describes the details of a thrust::async algorithm invocation:
+//
+// - Input type and initialization
+// - Output type and initialization (supports in-place, too)
+// - Postfix arguments that define the algorithm's overload set
+// - Abstracted invocation of the async algorithm
+// - Abstracted invocation of a reference algorithm
+// - Validation of async vs. reference output
+// - A description string.
+//
+// This definition is used by test_policy_overloads to test each overload
+// against a reference while injecting a variety of execution policies. This
+// validates that each overload behaves correctly according to some reference.
+//
+// Since much of the algorithm definition is generic and may be reused in
+// multiple tests with slight changes, a mix-in system is used to simplify
+// the creation of algorithm definitions. The following namespace hierarchy is
+// used to organize these generic components:
+//
+// * testing::async::mixin::
+// ** ::input - Input types/values (device vectors, counting iterators, etc)
+// ** ::output - Output types/values (device vectors, inplace device vectors,
+//                                    discard iterators, etc)
+// ** ::postfix_args - Algorithm specific overload sets
+// ** ::invoke_reference - Algorithm specific reference invocation
+// ** ::invoke_async - Algorithm specific async algo invocation
+// ** ::compare_outputs - Compare output values.
+//
+// Each algorithm should define its own `mixins.h` header to declare algorithm
+// specific mixins (e.g. postfix_args, invoke_reference, and invoke_async)
+// in a testing::async::<algorithm_name>::mixins namespace structure.
+//
+// For example, the test.async.exclusive_scan.basic test uses the following
+// algorithm definition from mix-ins:
+//
+// ```
+//   #include <async/test_policy_overloads.h>
+//   #include <async/mixin.h>
+//   #include <async/exclusive_scan/mixin.h>
+//   template <typename input_value_type,
+//            typename output_value_type   = input_value_type,
+//            typename initial_value_type  = input_value_type,
+//            typename alternate_binary_op = thrust::maximum<>>
+//   struct basic_invoker
+//      : testing::async::mixin::input::device_vector<input_value_type>
+//      , testing::async::mixin::output::device_vector<output_value_type>
+//      , testing::async::exclusive_scan::mixin::postfix_args::
+//          all_overloads<initial_value_type, alternate_binary_op>
+//      , testing::async::exclusive_scan::mixin::invoke_reference::
+//          host_synchronous<input_value_type, output_value_type>
+//      , testing::async::exclusive_scan::mixin::invoke_async::basic
+//      , testing::async::mixin::compare_outputs::assert_equal_quiet
+//   {
+//     static std::string description()
+//     {
+//       return "basic invocation with device vectors";
+//     }
+//   };
+//
+//   ...
+//
+//   testing::async::test_policy_overloads<basic_invoker<T>>::run(num_values);
+// ```
+//
+// The basic_invoker class expands to something similar to the following:
+//
+// ```
+//  template <typename input_value_type,
+//            typename output_value_type   = input_value_type,
+//            typename initial_value_type  = input_value_type,
+//            typename alternate_binary_op = thrust::maximum<>>
+//  struct basic_invoker
+//  {
+//  public:
+//
+//    static std::string description()
+//    {
+//      return "basic invocation with device vectors";
+//    }
+//
+//    //-------------------------------------------------------------------------
+//    // testing::async::mixin::input::device_vector
+//    //
+//    // input_type must provide idiomatic definitions of:
+//    // - `using iterator = ...;`
+//    // - `iterator begin() const { ... }`
+//    // - `iterator end() const { ... }`
+//    // - `size_t size() const { ... }`
+//    using input_type = thrust::device_vector<input_value_type>;
+//
+//    // Generate an instance of the input:
+//    static input_type generate_input(std::size_t num_values)
+//    {
+//      input_type input(num_values);
+//      thrust::sequence(input.begin(), input.end(), 25, 3);
+//      return input;
+//    }
+//
+//    //-------------------------------------------------------------------------
+//    // testing::async::mixin::output::device_vector
+//    //
+//    // output_type must provide idiomatic definitions of:
+//    // - `using iterator = ...;`
+//    // - `iterator begin() { ... }`
+//    using output_type = thrust::device_vector<output_value_type>;
+//
+//    // Generate an instance of the output:
+//    // Might be more complicated, eg. fancy iterators, etc
+//    static output_type generate_output(std::size_t num_values)
+//    {
+//      return output_type(num_values);
+//    }
+//
+//    //-------------------------------------------------------------------------
+//    // testing::async::exclusive_scan::mixin::postfix_args::all_overloads
+//    using postfix_args_type = std::tuple<   // List any extra arg overloads:
+//      std::tuple<>,                                       // - no extra args
+//      std::tuple<initial_value_type>,                     // - initial_value
+//      std::tuple<initial_value_type, alternate_binary_op> // - initial_value, binary_op
+//      >;
+//
+//    // Create instances of the extra arguments to use when invoking the
+//    // algorithm:
+//    static postfix_args_type generate_postfix_args()
+//    {
+//      return postfix_args_type{
+//        std::tuple<>{},                            // no extra args
+//        std::make_tuple(initial_value_type{42}),   // initial_value
+//        // initial_value, binary_op:
+//        std::make_tuple(initial_value_Type{57}, alternate_binary_op{})
+//      };
+//    }
+//
+//    //-------------------------------------------------------------------------
+//    //
+//    testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous
+//    //
+//    // Invoke a reference implementation for a single overload as described by
+//    // postfix_tuple. This tuple contains instances of any trailing arguments
+//    // to pass to the algorithm. The tuple/index_sequence pattern is used to
+//    // support a "no extra args" overload, since the parameter pack expansion
+//    // will do exactly what we want in all cases.
+//    template <typename PostfixArgTuple, std::size_t... PostfixArgIndices>
+//    static void invoke_reference(input_type const &input,
+//                                 output_type &output,
+//                                 PostfixArgTuple &&postfix_tuple,
+//                                 std::index_sequence<PostfixArgIndices...>)
+//    {
+//      // Create host versions of the input/output:
+//      thrust::host_vector<input_value_type> host_input(input.cbegin(),
+//                                                       input.cend());
+//      thrust::host_vector<output_value_type> host_output(host_input.size());
+//
+//      // Run host synchronous algorithm to generate reference.
+//      thrust::exclusive_scan(host_input.cbegin(),
+//                             host_input.cend(),
+//                             host_output.begin(),
+//                             std::get<PostfixArgIndices>(
+//                               THRUST_FWD(postfix_tuple))...);
+//
+//      // Copy back to device.
+//      output = host_output;
+//    }
+//
+//    //-------------------------------------------------------------------------
+//    // testing::async::mixin::exclusive_scan::mixin::invoke_async::basic
+//    //
+//    // Invoke the async algorithm for a single overload as described by
+//    // the prefix and postfix tuples. These tuples contains instances of any
+//    // additional arguments to pass to the algorithm. The tuple/index_sequence
+//    // pattern is used to support the "no extra args" overload, since the
+//    // parameter pack expansion will do exactly what we want in all cases.
+//    // Prefix args are included here (but not for invoke_reference) to allow
+//    // the test framework to change the execution policy.
+//    // This method must return an event or future.
+//    template <typename PrefixArgTuple,
+//              std::size_t... PrefixArgIndices,
+//              typename PostfixArgTuple,
+//              std::size_t... PostfixArgIndices>
+//    static auto invoke_async(PrefixArgTuple &&prefix_tuple,
+//                             std::index_sequence<PrefixArgIndices...>,
+//                             input_type const &input,
+//                             output_type &output,
+//                             PostfixArgTuple &&postfix_tuple,
+//                             std::index_sequence<PostfixArgIndices...>)
+//    {
+//      output.resize(input.size());
+//      auto e = thrust::async::exclusive_scan(
+//        std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+//        input.cbegin(),
+//        input.cend(),
+//        output.begin(),
+//        std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+//      return e;
+//    }
+//
+//    //-------------------------------------------------------------------------
+//    // testing::async::mixin::compare_outputs::assert_equal_quiet
+//    //
+//    // Wait on and validate the event/future (usually with TEST_EVENT_WAIT /
+//    // TEST_FUTURE_VALUE_RETRIEVAL), then check that the reference output
+//    // matches the testing output.
+//    template <typename EventType>
+//    static void compare_outputs(EventType &e,
+//                                output_type const &ref,
+//                                output_type const &test)
+//    {
+//      TEST_EVENT_WAIT(e);
+//      ASSERT_EQUAL_QUIET(ref, test);
+//    }
+// };
+// ```
+//
+// Similar invokers with slight tweaks are used in other
+// async/exclusive_scan/*.cu tests.
+
+// clang-format on
+
+namespace testing
+{
+namespace async
+{
+namespace mixin
+{
+
+//------------------------------------------------------------------------------
+namespace input
+{
+
+template <typename value_type>
+struct device_vector
+{
+  using input_type = thrust::device_vector<value_type>;
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    input_type input(num_values);
+    thrust::sequence(input.begin(),
+                     input.end(),
+                     static_cast<value_type>(1),
+                     static_cast<value_type>(1));
+    return input;
+  }
+};
+
+template <typename value_type>
+struct counting_iterator_from_0
+{
+  struct input_type
+  {
+    using iterator = thrust::counting_iterator<value_type>;
+
+    std::size_t num_values;
+
+    iterator begin() const { return iterator{static_cast<value_type>(0)}; }
+    iterator cbegin() const { return iterator{static_cast<value_type>(0)}; }
+
+    iterator end() const { return iterator{static_cast<value_type>(num_values)}; }
+    iterator cend() const { return iterator{static_cast<value_type>(num_values)}; }
+
+    std::size_t size() const { return num_values; }
+  };
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    return {num_values};
+  }
+};
+
+template <typename value_type>
+struct counting_iterator_from_1
+{
+  struct input_type
+  {
+    using iterator = thrust::counting_iterator<value_type>;
+
+    std::size_t num_values;
+
+    iterator begin() const { return iterator{static_cast<value_type>(1)}; }
+    iterator cbegin() const { return iterator{static_cast<value_type>(1)}; }
+
+    iterator end() const { return iterator{static_cast<value_type>(1 + num_values)}; }
+    iterator cend() const { return iterator{static_cast<value_type>(1 + num_values)}; }
+
+    std::size_t size() const { return num_values; }
+  };
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    return {num_values};
+  }
+};
+
+template <typename value_type>
+struct constant_iterator_1
+{
+  struct input_type
+  {
+    using iterator = thrust::constant_iterator<value_type>;
+
+    std::size_t num_values;
+
+    iterator begin() const { return iterator{static_cast<value_type>(1)}; }
+    iterator cbegin() const { return iterator{static_cast<value_type>(1)}; }
+
+    iterator end() const
+    {
+      return iterator{static_cast<value_type>(1)} + num_values;
+    }
+    iterator cend() const
+    {
+      return iterator{static_cast<value_type>(1)} + num_values;
+    }
+
+    std::size_t size() const { return num_values; }
+  };
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    return {num_values};
+  }
+};
+
+} // namespace input
+
+//------------------------------------------------------------------------------
+namespace output
+{
+
+template <typename value_type>
+struct device_vector
+{
+  using output_type = thrust::device_vector<value_type>;
+
+  template <typename InputType>
+  static output_type generate_output(std::size_t num_values,
+                                     InputType& /* unused */)
+  {
+    return output_type(num_values);
+  }
+};
+
+template <typename value_type>
+struct device_vector_reuse_input
+{
+  using output_type = thrust::device_vector<value_type>&;
+
+  template <typename InputType>
+  static output_type generate_output(std::size_t /*num_values*/,
+                                     InputType& input)
+  {
+    return input;
+  }
+};
+
+struct discard_iterator
+{
+  struct output_type
+  {
+    using iterator = thrust::discard_iterator<>;
+
+    iterator begin() const { return thrust::make_discard_iterator(); }
+    iterator cbegin() const { return thrust::make_discard_iterator(); }
+  };
+
+  template <typename InputType>
+  static output_type generate_output(std::size_t /* num_values */,
+                                     InputType& /* input */)
+  {
+    return output_type{};
+  }
+};
+
+} // namespace output
+
+//------------------------------------------------------------------------------
+namespace postfix_args
+{
+/* Defined per algorithm. Example:
+ *
+ * // Defines several overloads:
+ * // algorithm([policy,] input, output) // no postfix args
+ * // algorithm([policy,] input, output, initial_value)
+ * // algorithm([policy,] input, output, initial_value, binary_op)
+ * template <typename value_type,
+ *           typename alternate_binary_op = thrust::maximum<>>
+ * struct all_overloads
+ * {
+ *   using postfix_args_type = std::tuple<     // List any extra arg overloads:
+ *     std::tuple<>,                               // - no extra args
+ *     std::tuple<value_type>,                     // - initial_value
+ *     std::tuple<value_type, alternate_binary_op> // - initial_value, binary_op
+ *     >;
+ *
+ *   static postfix_args_type generate_postfix_args()
+ *   {
+ *     return postfix_args_type{
+ *       std::tuple<>{},                            // no extra args
+ *       std::make_tuple(initial_value_type{42}),   // initial_value
+ *       // initial_value, binary_op:
+ *       std::make_tuple(initial_value_Type{57}, alternate_binary_op{})
+ *   }
+ * };
+ *
+ */
+}
+
+//------------------------------------------------------------------------------
+namespace invoke_reference
+{
+
+/* Defined per algorithm. Example:
+ *
+ * template <typename input_value_type,
+ *           typename output_value_type = input_value_type>
+ * struct host_synchronous
+ * {
+ *   template <typename InputType,
+ *             typename OutputType,
+ *             typename PostfixArgTuple,
+ *             std::size_t... PostfixArgIndices>
+ *   static void invoke_reference(InputType const& input,
+ *                                OutputType& output,
+ *                                PostfixArgTuple&& postfix_tuple,
+ *                                std::index_sequence<PostfixArgIndices...>)
+ *   {
+ *     // Create host versions of the input/output:
+ *     thrust::host_vector<input_value_type> host_input(input.cbegin(),
+ *                                                      input.cend());
+ *     thrust::host_vector<output_value_type> host_output(host_input.size());
+ *
+ *     // Run host synchronous algorithm to generate reference.
+ *     // Be sure to call a backend that doesn't use the same underlying
+ *     // implementation.
+ *     thrust::exclusive_scan(host_input.cbegin(),
+ *                            host_input.cend(),
+ *                            host_output.begin(),
+ *                            std::get<PostfixArgIndices>(
+ *                              THRUST_FWD(postfix_tuple))...);
+ *
+ *     // Copy back to device.
+ *     output = host_output;
+ *   }
+ * };
+ *
+ */
+
+// Used to save time when testing unverifiable invocations (discard_iterators)
+struct noop
+{
+  template <typename... Ts>
+  static void invoke_reference(Ts&&...)
+  {}
+};
+
+} // namespace invoke_reference
+
+//------------------------------------------------------------------------------
+namespace invoke_async
+{
+
+/* Defined per algorithm. Example:
+ *
+ * struct basic
+ * {
+ *   template <typename PrefixArgTuple,
+ *             std::size_t... PrefixArgIndices,
+ *             typename InputType,
+ *             typename OutputType,
+ *             typename PostfixArgTuple,
+ *             std::size_t... PostfixArgIndices>
+ *   static auto invoke_async(PrefixArgTuple&& prefix_tuple,
+ *                            std::index_sequence<PrefixArgIndices...>,
+ *                            InputType const& input,
+ *                            OutputType& output,
+ *                            PostfixArgTuple&& postfix_tuple,
+ *                            std::index_sequence<PostfixArgIndices...>)
+ *   {
+ *     auto e = thrust::async::exclusive_scan(
+ *       std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+ *       input.cbegin(),
+ *       input.cend(),
+ *       output.begin(),
+ *       std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+ *     return e;
+ *   }
+ * };
+ */
+
+} // namespace invoke_async
+
+//------------------------------------------------------------------------------
+namespace compare_outputs
+{
+
+namespace detail
+{
+
+void basic_event_validation(thrust::device_event& e)
+{
+  TEST_EVENT_WAIT(e);
+}
+
+template <typename T>
+void basic_event_validation(thrust::device_future<T>& f)
+{
+  TEST_FUTURE_VALUE_RETRIEVAL(f);
+}
+
+} // namespace detail
+
+struct assert_equal
+{
+  template <typename EventType, typename OutputType>
+  static void compare_outputs(EventType& e,
+                              OutputType const& ref,
+                              OutputType const& test)
+  {
+    detail::basic_event_validation(e);
+    ASSERT_EQUAL(ref, test);
+  }
+};
+
+struct assert_almost_equal
+{
+  template <typename EventType, typename OutputType>
+  static void compare_outputs(EventType& e,
+                              OutputType const& ref,
+                              OutputType const& test)
+  {
+    detail::basic_event_validation(e);
+    ASSERT_ALMOST_EQUAL(ref, test);
+  }
+};
+
+// Does an 'almost_equal' comparison for floating point types. Since fp
+// addition is non-associative, this is sometimes necessary.
+struct assert_almost_equal_if_fp
+{
+private:
+  template <typename EventType, typename OutputType>
+  static void compare_outputs_impl(EventType& e,
+                                   OutputType const& ref,
+                                   OutputType const& test,
+                                   std::false_type /* is_floating_point */)
+  {
+    detail::basic_event_validation(e);
+    ASSERT_EQUAL(ref, test);
+  }
+
+  template <typename EventType, typename OutputType>
+  static void compare_outputs_impl(EventType& e,
+                                   OutputType const& ref,
+                                   OutputType const& test,
+                                   std::true_type /* is_floating_point */)
+  {
+    detail::basic_event_validation(e);
+    ASSERT_ALMOST_EQUAL(ref, test);
+  }
+
+public:
+  template <typename EventType, typename OutputType>
+  static void compare_outputs(EventType& e,
+                              OutputType const& ref,
+                              OutputType const& test)
+  {
+    using value_type = typename OutputType::value_type;
+    compare_outputs_impl(e, ref, test, std::is_floating_point<value_type>{});
+  }
+};
+
+struct assert_equal_quiet
+{
+  template <typename EventType, typename OutputType>
+  static void compare_outputs(EventType& e,
+                              OutputType const& ref,
+                              OutputType const& test)
+  {
+    detail::basic_event_validation(e);
+    ASSERT_EQUAL_QUIET(ref, test);
+  }
+};
+
+// Does an 'almost_equal' comparison for floating point types, since fp
+// addition is non-associative
+struct assert_almost_equal_if_fp_quiet
+{
+private:
+  template <typename EventType, typename OutputType>
+  static void compare_outputs_impl(EventType& e,
+                                   OutputType const& ref,
+                                   OutputType const& test,
+                                   std::false_type /* is_floating_point */)
+  {
+    detail::basic_event_validation(e);
+    ASSERT_EQUAL_QUIET(ref, test);
+  }
+
+  template <typename EventType, typename OutputType>
+  static void compare_outputs_impl(EventType& e,
+                                   OutputType const& ref,
+                                   OutputType const& test,
+                                   std::true_type /* is_floating_point */)
+  {
+    detail::basic_event_validation(e);
+    ASSERT_ALMOST_EQUAL(ref, test);
+  }
+
+public:
+  template <typename EventType, typename OutputType>
+  static void compare_outputs(EventType& e,
+                              OutputType const& ref,
+                              OutputType const& test)
+  {
+    using value_type = typename OutputType::value_type;
+    compare_outputs_impl(e, ref, test, std::is_floating_point<value_type>{});
+  }
+};
+
+// Used to save time when testing unverifiable invocations (discard_iterators).
+// Just does basic validation of the future/event.
+struct noop
+{
+  template <typename EventType, typename... Ts>
+  static void compare_outputs(EventType &e, Ts&&...)
+  {
+    detail::basic_event_validation(e);
+  }
+};
+
+} // namespace compare_outputs
+
+} // namespace mixin
+} // namespace async
+} // namespace testing
+
+#endif // C++14
diff --git a/testing/async/test_policy_overloads.h b/testing/async/test_policy_overloads.h
new file mode 100644
index 000000000..b7bf1ab94
--- /dev/null
+++ b/testing/async/test_policy_overloads.h
@@ -0,0 +1,410 @@
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/device_allocator.h>
+#include <thrust/future.h>
+
+#include <unittest/unittest.h>
+
+#include <string>
+
+// TODO Cover these cases from testing/async_reduce.cu:
+//   - [x] test_async_reduce_after ("after_future" in test_policy_overloads)
+//   - [ ] test_async_reduce_on_then_after (KNOWN_FAILURE, see #1195)
+//     - [ ] all the child variants (e.g. with allocator) too
+//   - [ ] test_async_copy_then_reduce (Need to figure out how to fit this in)
+//   - [ ] test_async_reduce_caching (only useful when returning future)
+
+namespace testing
+{
+
+namespace async
+{
+
+// Tests that policies are handled correctly for all overloads of an async
+// algorithm.
+//
+// The AlgoDef parameter type defines an async algorithm, its overloads, and
+// abstracts its invocation. See the async/mixins.h for a documented example of
+// this interface and some convenience mixins that can be used to construct a
+// definition quickly.
+//
+// The AlgoDef interface is used to run several tests of the algorithm,
+// exhaustively testing all overloads for algorithm correctness and proper
+// policy handling.
+//
+// ## Basic tests
+//
+// In the basic tests, each overload is called repeatedly with:
+// 1) No policy
+// 2) thrust::device
+// 3) thrust::device(thrust::device_allocator<void>)
+// 4) thrust::device.on(stream)
+// 5) thrust::device(thrust::device_allocator<void>).on(stream)
+//
+// The output of the async algorithm is compared against a reference output,
+// and the returned event/future is tested to make sure it holds a reference to
+// the expected stream.
+//
+// ## After Future tests
+//
+// The after_future tests check that the future/event returned from an algorithm
+// behaves properly when consumed by a policy's `.after` method.
+template <typename AlgoDef>
+struct test_policy_overloads
+{
+  using algo_def          = AlgoDef;
+  using input_type        = typename algo_def::input_type;
+  using output_type       = typename algo_def::output_type;
+  using postfix_args_type = typename algo_def::postfix_args_type;
+
+  static constexpr std::size_t num_postfix_arg_sets =
+    std::tuple_size<postfix_args_type>::value;
+
+  // Main entry point; call this from a unit test function.
+  static void run(std::size_t num_values)
+  {
+    test_postfix_overloads(num_values);
+  }
+
+private:
+  template <std::size_t Size>
+  using size_const = std::integral_constant<std::size_t, Size>;
+
+  //----------------------------------------------------------------------------
+  // Recursively call sub tests for each overload set in postfix_args:
+  template <std::size_t PostfixIdx = 0>
+  static void test_postfix_overloads(std::size_t const num_values,
+                                     size_const<PostfixIdx> = {})
+  {
+    static_assert(PostfixIdx < num_postfix_arg_sets, "Internal error.");
+
+    run_basic_policy_tests<PostfixIdx>(num_values);
+    run_after_future_tests<PostfixIdx>(num_values);
+
+    // Recurse to test next round of overloads:
+    test_postfix_overloads(num_values, size_const<PostfixIdx + 1>{});
+  }
+
+  static void test_postfix_overloads(std::size_t const,
+                                     size_const<num_postfix_arg_sets>)
+  {
+    // terminal case, no-op
+  }
+
+  //----------------------------------------------------------------------------
+  // For the specified postfix overload set, test the algorithm with several
+  // different policy configurations.
+  template <std::size_t PostfixIdx>
+  static void run_basic_policy_tests(std::size_t const num_values)
+  {
+    // When a policy uses the default stream, the algorithm implementation
+    // should spawn a new stream in the returned event:
+    auto using_default_stream = [](auto& e) {
+      ASSERT_NOT_EQUAL(thrust::cuda_cub::default_stream(),
+                       e.stream().native_handle());
+    };
+
+    // When a policy uses a non-default stream, the implementation should pass
+    // the stream through to the output:
+    thrust::system::cuda::detail::unique_stream test_stream{};
+    auto using_test_stream = [&test_stream](auto& e) {
+      ASSERT_EQUAL(test_stream.native_handle(), e.stream().native_handle());
+    };
+
+    // Test the different types of policies:
+    basic_policy_test<PostfixIdx>("(no policy)",
+                                   std::make_tuple(),
+                                   using_default_stream,
+                                   num_values);
+
+    basic_policy_test<PostfixIdx>("thrust::device",
+                                   std::make_tuple(thrust::device),
+                                   using_default_stream,
+                                   num_values);
+
+    basic_policy_test<PostfixIdx>(
+      "thrust::device(thrust::device_allocator<void>{})",
+      std::make_tuple(thrust::device(thrust::device_allocator<void>{})),
+      using_default_stream,
+      num_values);
+
+    basic_policy_test<PostfixIdx>("thrust::device.on(test_stream.get())",
+                                   std::make_tuple(
+                                     thrust::device.on(test_stream.get())),
+                                   using_test_stream,
+                                   num_values);
+
+    basic_policy_test<PostfixIdx>(
+      "thrust::device(thrust::device_allocator<void>{}).on(test_stream.get())",
+      std::make_tuple(
+        thrust::device(thrust::device_allocator<void>{}).on(test_stream.get())),
+      using_test_stream,
+      num_values);
+  }
+
+  // Invoke the algorithm multiple times with the provided policy and validate
+  // the results.
+  template <std::size_t PostfixIdx,
+            typename PrefixArgTuple,
+            typename ValidateEvent>
+  static void basic_policy_test(std::string const &policy_desc,
+                                PrefixArgTuple &&prefix_tuple_ref,
+                                ValidateEvent const &validate,
+                                std::size_t num_values)
+  try
+  {
+    // Sink the prefix tuple into a const local so it can be safely passed to
+    // multiple invocations without worrying about potential modifications.
+    using prefix_tuple_type = thrust::remove_cvref_t<PrefixArgTuple>;
+    prefix_tuple_type const prefix_tuple = THRUST_FWD(prefix_tuple_ref);
+
+    using postfix_tuple_type =
+      std::tuple_element_t<PostfixIdx, postfix_args_type>;
+    postfix_tuple_type const postfix_tuple = get_postfix_tuple<PostfixIdx>();
+
+    // Generate index sequences for the tuples:
+    constexpr auto prefix_tuple_size  = std::tuple_size<prefix_tuple_type>{};
+    constexpr auto postfix_tuple_size = std::tuple_size<postfix_tuple_type>{};
+    using prefix_index_seq  = std::make_index_sequence<prefix_tuple_size>;
+    using postfix_index_seq = std::make_index_sequence<postfix_tuple_size>;
+
+    // Use unique, non-const inputs for each invocation to support in-place
+    // algo_def configurations.
+    input_type input_a   = algo_def::generate_input(num_values);
+    input_type input_b   = algo_def::generate_input(num_values);
+    input_type input_c   = algo_def::generate_input(num_values);
+    input_type input_d   = algo_def::generate_input(num_values);
+    input_type input_ref = algo_def::generate_input(num_values);
+
+    output_type output_a   = algo_def::generate_output(num_values, input_a);
+    output_type output_b   = algo_def::generate_output(num_values, input_b);
+    output_type output_c   = algo_def::generate_output(num_values, input_c);
+    output_type output_d   = algo_def::generate_output(num_values, input_d);
+    output_type output_ref = algo_def::generate_output(num_values, input_ref);
+
+    // Invoke multiple overlapping async algorithms, capturing their outputs
+    // and events/futures:
+    auto e_a = algo_def::invoke_async(prefix_tuple,
+                                      prefix_index_seq{},
+                                      input_a,
+                                      output_a,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+    auto e_b = algo_def::invoke_async(prefix_tuple,
+                                      prefix_index_seq{},
+                                      input_b,
+                                      output_b,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+    auto e_c = algo_def::invoke_async(prefix_tuple,
+                                      prefix_index_seq{},
+                                      input_c,
+                                      output_c,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+    auto e_d = algo_def::invoke_async(prefix_tuple,
+                                      prefix_index_seq{},
+                                      input_d,
+                                      output_d,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+
+    // Let reference calc overlap with async testing:
+    algo_def::invoke_reference(input_ref,
+                               output_ref,
+                               postfix_tuple,
+                               postfix_index_seq{});
+
+    // These wait on the e_X events:
+    algo_def::compare_outputs(e_a, output_ref, output_a);
+    algo_def::compare_outputs(e_b, output_ref, output_b);
+    algo_def::compare_outputs(e_c, output_ref, output_c);
+    algo_def::compare_outputs(e_d, output_ref, output_d);
+
+    validate(e_a);
+    validate(e_b);
+    validate(e_c);
+    validate(e_d);
+  }
+  catch (unittest::UnitTestException &exc)
+  {
+    // Append some identifying information to the exception to help with
+    // debugging:
+    using overload_t = std::tuple_element_t<PostfixIdx, postfix_args_type>;
+
+    std::string const overload_desc =
+      unittest::demangle(typeid(overload_t).name());
+    std::string const input_desc =
+      unittest::demangle(typeid(input_type).name());
+    std::string const output_desc =
+      unittest::demangle(typeid(output_type).name());
+
+    exc << "\n"
+        << " - algo_def::description = " << algo_def::description() << "\n"
+        << " - test = basic_policy\n"
+        << " - policy = " << policy_desc << "\n"
+        << " - input_type = " << input_desc << "\n"
+        << " - output_type = " << output_desc << "\n"
+        << " - tuple of trailing arguments = " << overload_desc << "\n"
+        << " - num_values = " << num_values;
+    throw;
+  }
+
+  //----------------------------------------------------------------------------
+  // Test .after(event/future) handling:
+  template <std::size_t PostfixIdx>
+  static void run_after_future_tests(std::size_t const num_values)
+  try
+  {
+    using postfix_tuple_type =
+    std::tuple_element_t<PostfixIdx, postfix_args_type>;
+    postfix_tuple_type const postfix_tuple = get_postfix_tuple<PostfixIdx>();
+
+    // Generate index sequences for the tuples. Prefix size always = 1 here,
+    // since the async algorithms are always invoked with a single prefix
+    // arg (the execution policy) here.
+    constexpr auto postfix_tuple_size = std::tuple_size<postfix_tuple_type>{};
+    using prefix_index_seq  = std::make_index_sequence<1>;
+    using postfix_index_seq = std::make_index_sequence<postfix_tuple_size>;
+
+    // Use unique, non-const inputs for each invocation to support in-place
+    // algo_def configurations.
+    input_type input_a   = algo_def::generate_input(num_values);
+    input_type input_b   = algo_def::generate_input(num_values);
+    input_type input_c   = algo_def::generate_input(num_values);
+    input_type input_tmp = algo_def::generate_input(num_values);
+    input_type input_ref = algo_def::generate_input(num_values);
+
+    output_type output_a   = algo_def::generate_output(num_values, input_a);
+    output_type output_b   = algo_def::generate_output(num_values, input_b);
+    output_type output_c   = algo_def::generate_output(num_values, input_c);
+    output_type output_tmp = algo_def::generate_output(num_values, input_tmp);
+    output_type output_ref = algo_def::generate_output(num_values, input_ref);
+
+    auto e_a = algo_def::invoke_async(std::make_tuple(thrust::device),
+                                      prefix_index_seq{},
+                                      input_a,
+                                      output_a,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+    ASSERT_EQUAL(true, e_a.valid_stream());
+    auto const stream_a = e_a.stream().native_handle();
+
+    // Execution on default stream should create a new stream in the result:
+    ASSERT_NOT_EQUAL_QUIET(thrust::cuda_cub::default_stream(), stream_a);
+
+    //--------------------------------------------------------------------------
+    // Test event consumption when the event is an rvalue.
+    //--------------------------------------------------------------------------
+    // Using `forward_as_tuple` instead of `make_tuple` to explicitly control
+    // value categories.
+    // Explicitly order this invocation after e_a:
+    auto e_b =
+      algo_def::invoke_async(std::forward_as_tuple(thrust::device.after(e_a)),
+                             prefix_index_seq{},
+                             input_b,
+                             output_b,
+                             postfix_tuple,
+                             postfix_index_seq{});
+    ASSERT_EQUAL(true, e_b.valid_stream());
+    auto const stream_b = e_b.stream().native_handle();
+
+    // Second invocation should use same stream as before:
+    ASSERT_EQUAL_QUIET(stream_a, stream_b);
+
+    // Verify that double consumption of e_a produces an exception:
+    ASSERT_THROWS_EQUAL(auto x = algo_def::invoke_async(
+                          std::forward_as_tuple(thrust::device.after(e_a)),
+                          prefix_index_seq{},
+                          input_tmp,
+                          output_tmp,
+                          postfix_tuple,
+                          postfix_index_seq{});
+                        THRUST_UNUSED_VAR(x),
+                        thrust::event_error,
+                        thrust::event_error(thrust::event_errc::no_state));
+
+    //--------------------------------------------------------------------------
+    // Test event consumption when the event is an lvalue
+    //--------------------------------------------------------------------------
+    // Explicitly order this invocation after e_b:
+    auto policy_after_e_b = thrust::device.after(e_b);
+    auto policy_after_e_b_tuple = std::forward_as_tuple(policy_after_e_b);
+    auto e_c =
+      algo_def::invoke_async(policy_after_e_b_tuple,
+                             prefix_index_seq{},
+                             input_c,
+                             output_c,
+                             postfix_tuple,
+                             postfix_index_seq{});
+    ASSERT_EQUAL(true, e_c.valid_stream());
+    auto const stream_c = e_c.stream().native_handle();
+
+    // Should use same stream as e_b:
+    ASSERT_EQUAL_QUIET(stream_b, stream_c);
+
+    // Verify that double consumption of e_b produces an exception:
+    ASSERT_THROWS_EQUAL(
+      auto x = algo_def::invoke_async(policy_after_e_b_tuple,
+                                      prefix_index_seq{},
+                                      input_tmp,
+                                      output_tmp,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+      THRUST_UNUSED_VAR(x),
+      thrust::event_error,
+      thrust::event_error(thrust::event_errc::no_state));
+
+    // Let reference calc overlap with async testing:
+    algo_def::invoke_reference(input_ref,
+                               output_ref,
+                               postfix_tuple,
+                               postfix_index_seq{});
+
+    // Validate results
+    // Use e_c for all three checks -- e_a and e_b will not pass the event
+    // checks since their streams were stolen by dependencies.
+    algo_def::compare_outputs(e_c, output_ref, output_a);
+    algo_def::compare_outputs(e_c, output_ref, output_b);
+    algo_def::compare_outputs(e_c, output_ref, output_c);
+  }
+  catch (unittest::UnitTestException &exc)
+  {
+    // Append some identifying information to the exception to help with
+    // debugging:
+    using postfix_t = std::tuple_element_t<PostfixIdx, postfix_args_type>;
+
+    std::string const postfix_desc =
+      unittest::demangle(typeid(postfix_t).name());
+    std::string const input_desc =
+      unittest::demangle(typeid(input_type).name());
+    std::string const output_desc =
+      unittest::demangle(typeid(output_type).name());
+
+    exc << "\n"
+        << " - algo_def::description = " << algo_def::description() << "\n"
+        << " - test = after_future\n"
+        << " - input_type = " << input_desc << "\n"
+        << " - output_type = " << output_desc << "\n"
+        << " - tuple of trailing arguments = " << postfix_desc << "\n"
+        << " - num_values = " << num_values;
+    throw;
+  }
+
+  //----------------------------------------------------------------------------
+  // Various helper functions:
+  template <std::size_t PostfixIdx>
+  static auto get_postfix_tuple()
+  {
+    return std::get<PostfixIdx>(algo_def::generate_postfix_args());
+  }
+};
+
+} // namespace async
+} // namespace testing
+
+#endif // C++14
diff --git a/testing/async_copy.cu b/testing/async_copy.cu
index 338b94e1a..2666a6c38 100644
--- a/testing/async_copy.cu
+++ b/testing/async_copy.cu
@@ -1,6 +1,6 @@
 #include <thrust/detail/config.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <unittest/unittest.h>
 #include <unittest/util_async.h>
@@ -18,7 +18,7 @@
     auto operator()(                                                          \
       ForwardIt&& first, Sentinel&& last, OutputIt&& output                   \
     ) const                                                                   \
-    THRUST_DECLTYPE_RETURNS(                                                  \
+    THRUST_RETURNS(                                                           \
       ::thrust::async::copy(                                                  \
         __VA_ARGS__                                                           \
         THRUST_PP_COMMA_IF(THRUST_PP_ARITY(__VA_ARGS__))                      \
@@ -101,7 +101,7 @@ struct test_async_copy_device_to_host
     void operator()(std::size_t n)
     {
       thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
-      thrust::device_vector<T> h1(n);
+      thrust::host_vector<T>   h1(n);
       thrust::device_vector<T> d0(n);
 
       thrust::copy(h0.begin(), h0.end(), d0.begin());
@@ -267,6 +267,11 @@ struct test_async_copy_counting_iterator_input_to_host_vector
       f0.wait();
 
       ASSERT_EQUAL(d0, d1);
+
+      #if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_INTEL)
+      // ICC fails this for some unknown reason - see #1468.
+      KNOWN_FAILURE;
+      #endif
     }
   };
 };
@@ -319,6 +324,84 @@ DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
 
 ///////////////////////////////////////////////////////////////////////////////
 
+template <typename T>
+struct test_async_copy_after
+{
+  __host__
+  void operator()(std::size_t n)
+  {
+    thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+    thrust::host_vector<T>   h1(n);
+    thrust::device_vector<T> d0(n);
+    thrust::device_vector<T> d1(n);
+    thrust::device_vector<T> d2(n);
+
+    auto e0 = thrust::async::copy(
+      h0.begin(), h0.end(), d0.begin()
+    );
+
+    ASSERT_EQUAL(true, e0.valid_stream());
+
+    auto const e0_stream = e0.stream().native_handle();
+
+    auto e1 = thrust::async::copy(
+      thrust::device.after(e0), d0.begin(), d0.end(), d1.begin()
+    );
+
+    // Verify that double consumption of a future produces an exception.
+    ASSERT_THROWS_EQUAL(
+      auto x = thrust::async::copy(
+        thrust::device.after(e0), d0.begin(), d0.end(), d1.begin()
+      );
+      THRUST_UNUSED_VAR(x)
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    ASSERT_EQUAL_QUIET(e0_stream, e1.stream().native_handle());
+
+    auto after_policy2 = thrust::device.after(e1);
+
+    auto e2 = thrust::async::copy(
+      thrust::host, after_policy2
+    , h0.begin(), h0.end(), d2.begin()
+    );
+
+    // Verify that double consumption of a policy produces an exception.
+    ASSERT_THROWS_EQUAL(
+      auto x = thrust::async::copy(
+        thrust::host, after_policy2
+      , h0.begin(), h0.end(), d2.begin()
+      );
+      THRUST_UNUSED_VAR(x)
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    ASSERT_EQUAL_QUIET(e0_stream, e2.stream().native_handle());
+
+    auto e3 = thrust::async::copy(
+      thrust::device.after(e2), thrust::host
+    , d1.begin(), d1.end(), h1.begin()
+    );
+
+    ASSERT_EQUAL_QUIET(e0_stream, e3.stream().native_handle());
+
+    TEST_EVENT_WAIT(e3);
+
+    ASSERT_EQUAL(h0, h1);
+    ASSERT_EQUAL(h0, d0);
+    ASSERT_EQUAL(h0, d1);
+    ASSERT_EQUAL(h0, d2);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(
+  test_async_copy_after
+, BuiltinNumericTypes
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
 // TODO: device_to_device NonContiguousIterator output (discard_iterator).
 
 // TODO: host_to_device non trivially relocatable.
diff --git a/testing/async_for_each.cu b/testing/async_for_each.cu
index 7ed033e9e..a09adf255 100644
--- a/testing/async_for_each.cu
+++ b/testing/async_for_each.cu
@@ -1,6 +1,6 @@
 #include <thrust/detail/config.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <unittest/unittest.h>
 
@@ -16,7 +16,7 @@
     auto operator()(                                                          \
       ForwardIt&& first, Sentinel&& last, UnaryFunction&& f                   \
     ) const                                                                   \
-    THRUST_DECLTYPE_RETURNS(                                                  \
+    THRUST_RETURNS(                                                           \
       ::thrust::async::for_each(                                              \
         __VA_ARGS__                                                           \
         THRUST_PP_COMMA_IF(THRUST_PP_ARITY(__VA_ARGS__))                      \
diff --git a/testing/async_reduce.cmake b/testing/async_reduce.cmake
new file mode 100644
index 000000000..44c0fbda1
--- /dev/null
+++ b/testing/async_reduce.cmake
@@ -0,0 +1,4 @@
+# Disable unreachable code warnings.
+# This test unconditionally throws in some places, the compiler will detect that
+# control flow will never reach some instructions. This is intentional.
+target_link_libraries(${test_target} PRIVATE thrust.silence_unreachable_code_warnings)
diff --git a/testing/async_reduce.cu b/testing/async_reduce.cu
index 5987fe6ae..c033c2311 100644
--- a/testing/async_reduce.cu
+++ b/testing/async_reduce.cu
@@ -2,7 +2,7 @@
 
 #include <thrust/detail/config.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <unittest/unittest.h>
 #include <unittest/util_async.h>
@@ -76,7 +76,7 @@ struct custom_plus
     auto operator()(                                                          \
       ForwardIt&& first, Sentinel&& last                                      \
     )                                                                         \
-    THRUST_DECLTYPE_RETURNS(                                                  \
+    THRUST_RETURNS(                                                           \
       ::thrust::reduce(                                                       \
         __VA_ARGS__                                                           \
       )                                                                       \
@@ -975,6 +975,8 @@ struct test_async_reduce_allocator_on_then_after
     KNOWN_FAILURE;
     // FIXME: The below fails because you can't combine allocator attachment,
     // `.on`, and `.after`.
+    // The `#if 0` can be removed once the KNOWN_FAILURE is resolved.
+#if 0
     ASSERT_EQUAL_QUIET(stream1, f2.stream().native_handle());
 
     // This potentially runs concurrently with the copies.
@@ -986,6 +988,7 @@ struct test_async_reduce_allocator_on_then_after
 
     thrust::cuda_cub::throw_on_error(cudaStreamDestroy(stream0));
     thrust::cuda_cub::throw_on_error(cudaStreamDestroy(stream1));
+#endif
   }
 };
 DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(
diff --git a/testing/async_reduce_into.cu b/testing/async_reduce_into.cu
index 0800a1a50..a4a2be99e 100644
--- a/testing/async_reduce_into.cu
+++ b/testing/async_reduce_into.cu
@@ -2,7 +2,7 @@
 
 #include <thrust/detail/config.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <unittest/unittest.h>
 #include <unittest/util_async.h>
@@ -77,7 +77,7 @@ struct custom_plus
     auto operator()(                                                          \
       ForwardIt&& first, Sentinel&& last                                      \
     )                                                                         \
-    THRUST_DECLTYPE_RETURNS(                                                  \
+    THRUST_RETURNS(                                                           \
       ::thrust::reduce(                                                       \
         __VA_ARGS__                                                           \
       )                                                                       \
diff --git a/testing/async_sort.cu b/testing/async_sort.cu
index 626e21c3c..c5cfeae23 100644
--- a/testing/async_sort.cu
+++ b/testing/async_sort.cu
@@ -1,6 +1,13 @@
 #include <thrust/detail/config.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+// Disabled on MSVC && NVCC < 11.1 for GH issue #1098.
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && defined(__CUDACC__)
+#if (__CUDACC_VER_MAJOR__ < 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ < 1)
+#define THRUST_BUG_1098_ACTIVE
+#endif // NVCC version check
+#endif // MSVC + NVCC check
+
+#if THRUST_CPP_DIALECT >= 2014 && !defined(THRUST_BUG_1098_ACTIVE)
 
 #include <unittest/unittest.h>
 
@@ -48,7 +55,7 @@ struct custom_greater
     static auto async(                                                        \
       ForwardIt&& first, Sentinel&& last                                      \
     )                                                                         \
-    THRUST_DECLTYPE_RETURNS(                                                  \
+    THRUST_RETURNS(                                                           \
       ::thrust::async::sort(                                                  \
         __VA_ARGS__                                                           \
         THRUST_PP_COMMA_IF(THRUST_PP_ARITY(__VA_ARGS__))                      \
@@ -89,7 +96,7 @@ DEFINE_SORT_INVOKER(
     static auto async(                                                        \
       ForwardIt&& first, Sentinel&& last                                      \
     )                                                                         \
-    THRUST_DECLTYPE_RETURNS(                                                  \
+    THRUST_RETURNS(                                                           \
       ::thrust::async::sort(                                                  \
         __VA_ARGS__                                                           \
         THRUST_PP_COMMA_IF(THRUST_PP_ARITY(__VA_ARGS__))                      \
@@ -147,7 +154,7 @@ struct test_async_sort
         d0_data.begin(), d0_data.end()
       );
 
-      if (wait_for_futures == WaitPolicy)
+      THRUST_IF_CONSTEXPR(wait_for_futures == WaitPolicy)
       {
         f0.wait();
 
diff --git a/testing/async_transform.cu b/testing/async_transform.cu
index 328a4e563..efaa885f0 100644
--- a/testing/async_transform.cu
+++ b/testing/async_transform.cu
@@ -1,6 +1,6 @@
 #include <thrust/detail/config.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <unittest/unittest.h>
 #include <unittest/util_async.h>
@@ -78,7 +78,7 @@ struct divide_by_2
       ForwardIt&& first, Sentinel&& last, OutputIt&& output                   \
     , UnaryOperation&& op                                                     \
     )                                                                         \
-    THRUST_DECLTYPE_RETURNS(                                                  \
+    THRUST_RETURNS(                                                           \
       ::thrust::transform(                                                    \
         __VA_ARGS__                                                           \
       )                                                                       \
diff --git a/testing/binary_search.cu b/testing/binary_search.cu
index d83e6acbc..2aceb8645 100644
--- a/testing/binary_search.cu
+++ b/testing/binary_search.cu
@@ -291,3 +291,57 @@ void TestScalarEqualRangeDispatchImplicit()
 DECLARE_UNITTEST(TestScalarEqualRangeDispatchImplicit);
 
 THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
+
+void TestBoundsWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::detail::intmax_t distance_low_value = thrust::distance(
+        begin,
+        thrust::lower_bound(
+            thrust::device,
+            begin,
+            end,
+            17));
+
+    thrust::detail::intmax_t distance_high_value = thrust::distance(
+        begin,
+        thrust::lower_bound(
+            thrust::device,
+            begin,
+            end,
+            (1ll << magnitude) - 17));
+
+    ASSERT_EQUAL(distance_low_value, 16);
+    ASSERT_EQUAL(distance_high_value, (1ll << magnitude) - 18);
+
+    distance_low_value = thrust::distance(
+        begin,
+        thrust::upper_bound(
+            thrust::device,
+            begin,
+            end,
+            17));
+
+    distance_high_value = thrust::distance(
+        begin,
+        thrust::upper_bound(
+            thrust::device,
+            begin,
+            end,
+            (1ll << magnitude) - 17));
+
+    ASSERT_EQUAL(distance_low_value, 17);
+    ASSERT_EQUAL(distance_high_value, (1ll << magnitude) - 17);
+}
+
+void TestBoundsWithBigIndexes()
+{
+    TestBoundsWithBigIndexesHelper(30);
+    TestBoundsWithBigIndexesHelper(31);
+    TestBoundsWithBigIndexesHelper(32);
+    TestBoundsWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestBoundsWithBigIndexes);
diff --git a/testing/binary_search_descending.cu b/testing/binary_search_descending.cu
index 5228c4567..08294c044 100644
--- a/testing/binary_search_descending.cu
+++ b/testing/binary_search_descending.cu
@@ -22,16 +22,16 @@ void TestScalarLowerBoundDescendingSimple(void)
     vec[3] = 2;
     vec[4] = 0;
 
-    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::lower_bound(vec.begin(), vec.end(), 0, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::lower_bound(vec.begin(), vec.end(), 1, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::lower_bound(vec.begin(), vec.end(), 2, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::lower_bound(vec.begin(), vec.end(), 3, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::lower_bound(vec.begin(), vec.end(), 4, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::lower_bound(vec.begin(), vec.end(), 5, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::lower_bound(vec.begin(), vec.end(), 6, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::lower_bound(vec.begin(), vec.end(), 7, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::lower_bound(vec.begin(), vec.end(), 8, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::lower_bound(vec.begin(), vec.end(), 9, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::lower_bound(vec.begin(), vec.end(), T{0}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::lower_bound(vec.begin(), vec.end(), T{1}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::lower_bound(vec.begin(), vec.end(), T{2}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::lower_bound(vec.begin(), vec.end(), T{3}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::lower_bound(vec.begin(), vec.end(), T{4}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::lower_bound(vec.begin(), vec.end(), T{5}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::lower_bound(vec.begin(), vec.end(), T{6}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::lower_bound(vec.begin(), vec.end(), T{7}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::lower_bound(vec.begin(), vec.end(), T{8}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::lower_bound(vec.begin(), vec.end(), T{9}, thrust::greater<T>()));
 }
 DECLARE_VECTOR_UNITTEST(TestScalarLowerBoundDescendingSimple);
 
@@ -49,16 +49,16 @@ void TestScalarUpperBoundDescendingSimple(void)
     vec[3] = 2;
     vec[4] = 0;
 
-    ASSERT_EQUAL_QUIET(vec.begin() + 5, thrust::upper_bound(vec.begin(), vec.end(), 0, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::upper_bound(vec.begin(), vec.end(), 1, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::upper_bound(vec.begin(), vec.end(), 2, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::upper_bound(vec.begin(), vec.end(), 3, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::upper_bound(vec.begin(), vec.end(), 4, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::upper_bound(vec.begin(), vec.end(), 5, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::upper_bound(vec.begin(), vec.end(), 6, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::upper_bound(vec.begin(), vec.end(), 7, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::upper_bound(vec.begin(), vec.end(), 8, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::upper_bound(vec.begin(), vec.end(), 9, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 5, thrust::upper_bound(vec.begin(), vec.end(), T{0}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::upper_bound(vec.begin(), vec.end(), T{1}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::upper_bound(vec.begin(), vec.end(), T{2}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::upper_bound(vec.begin(), vec.end(), T{3}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::upper_bound(vec.begin(), vec.end(), T{4}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::upper_bound(vec.begin(), vec.end(), T{5}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::upper_bound(vec.begin(), vec.end(), T{6}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::upper_bound(vec.begin(), vec.end(), T{7}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::upper_bound(vec.begin(), vec.end(), T{8}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::upper_bound(vec.begin(), vec.end(), T{9}, thrust::greater<T>()));
 }
 DECLARE_VECTOR_UNITTEST(TestScalarUpperBoundDescendingSimple);
 
@@ -76,16 +76,16 @@ void TestScalarBinarySearchDescendingSimple(void)
     vec[3] = 2;
     vec[4] = 0;
 
-    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), 0, thrust::greater<T>()));
-    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), 1, thrust::greater<T>()));
-    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), 2, thrust::greater<T>()));
-    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), 3, thrust::greater<T>()));
-    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), 4, thrust::greater<T>()));
-    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), 5, thrust::greater<T>()));
-    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), 6, thrust::greater<T>()));
-    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), 7, thrust::greater<T>()));
-    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), 8, thrust::greater<T>()));
-    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), 9, thrust::greater<T>()));
+    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), T{0}, thrust::greater<T>()));
+    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), T{1}, thrust::greater<T>()));
+    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), T{2}, thrust::greater<T>()));
+    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), T{3}, thrust::greater<T>()));
+    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), T{4}, thrust::greater<T>()));
+    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), T{5}, thrust::greater<T>()));
+    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), T{6}, thrust::greater<T>()));
+    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), T{7}, thrust::greater<T>()));
+    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), T{8}, thrust::greater<T>()));
+    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), T{9}, thrust::greater<T>()));
 }
 DECLARE_VECTOR_UNITTEST(TestScalarBinarySearchDescendingSimple);
 
@@ -103,27 +103,27 @@ void TestScalarEqualRangeDescendingSimple(void)
     vec[3] = 2;
     vec[4] = 0;
 
-    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), 0, thrust::greater<T>()).first);
-    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), 1, thrust::greater<T>()).first);
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 2, thrust::greater<T>()).first);
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 3, thrust::greater<T>()).first);
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 4, thrust::greater<T>()).first);
-    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), 5, thrust::greater<T>()).first);
-    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), 6, thrust::greater<T>()).first);
-    ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::equal_range(vec.begin(), vec.end(), 7, thrust::greater<T>()).first);
-    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::equal_range(vec.begin(), vec.end(), 8, thrust::greater<T>()).first);
-    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::equal_range(vec.begin(), vec.end(), 9, thrust::greater<T>()).first);
-
-    ASSERT_EQUAL_QUIET(vec.begin() + 5, thrust::equal_range(vec.begin(), vec.end(), 0, thrust::greater<T>()).second);
-    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), 1, thrust::greater<T>()).second);
-    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), 2, thrust::greater<T>()).second);
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 3, thrust::greater<T>()).second);
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 4, thrust::greater<T>()).second);
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 5, thrust::greater<T>()).second);
-    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), 6, thrust::greater<T>()).second);
-    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), 7, thrust::greater<T>()).second);
-    ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::equal_range(vec.begin(), vec.end(), 8, thrust::greater<T>()).second);
-    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::equal_range(vec.begin(), vec.end(), 9, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), T{0}, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), T{1}, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), T{2}, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), T{3}, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), T{4}, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), T{5}, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), T{6}, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::equal_range(vec.begin(), vec.end(), T{7}, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::equal_range(vec.begin(), vec.end(), T{8}, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::equal_range(vec.begin(), vec.end(), T{9}, thrust::greater<T>()).first);
+
+    ASSERT_EQUAL_QUIET(vec.begin() + 5, thrust::equal_range(vec.begin(), vec.end(), T{0}, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), T{1}, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), T{2}, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), T{3}, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), T{4}, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), T{5}, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), T{6}, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), T{7}, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::equal_range(vec.begin(), vec.end(), T{8}, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::equal_range(vec.begin(), vec.end(), T{9}, thrust::greater<T>()).second);
 }
 DECLARE_VECTOR_UNITTEST(TestScalarEqualRangeDescendingSimple);
 
diff --git a/testing/binary_search_vector.cu b/testing/binary_search_vector.cu
index d9a261c45..5e8f8358e 100644
--- a/testing/binary_search_vector.cu
+++ b/testing/binary_search_vector.cu
@@ -1,6 +1,7 @@
 #include <unittest/unittest.h>
 #include <thrust/binary_search.h>
 
+#include <thrust/detail/allocator/allocator_traits.h>
 #include <thrust/sequence.h>
 #include <thrust/sort.h>
 #include <thrust/iterator/discard_iterator.h>
@@ -16,7 +17,8 @@ template <class ExampleVector, typename NewType>
 struct vector_like
 {
     typedef typename ExampleVector::allocator_type alloc;
-    typedef typename alloc::template rebind<NewType>::other new_alloc;
+    typedef typename thrust::detail::allocator_traits<alloc> alloc_traits;
+    typedef typename alloc_traits::template rebind_alloc<NewType> new_alloc;
     typedef thrust::detail::vector_base<NewType, new_alloc> type;
 };
 
diff --git a/testing/binary_search_vector_descending.cu b/testing/binary_search_vector_descending.cu
index 88ec5a3e3..edc70663a 100644
--- a/testing/binary_search_vector_descending.cu
+++ b/testing/binary_search_vector_descending.cu
@@ -2,6 +2,7 @@
 #include <thrust/binary_search.h>
 #include <thrust/functional.h>
 
+#include <thrust/detail/allocator/allocator_traits.h>
 #include <thrust/sequence.h>
 #include <thrust/sort.h>
 
@@ -14,7 +15,8 @@ template <class ExampleVector, typename NewType>
 struct vector_like
 {
     typedef typename ExampleVector::allocator_type alloc;
-    typedef typename alloc::template rebind<NewType>::other new_alloc;
+    typedef typename thrust::detail::allocator_traits<alloc> alloc_traits;
+    typedef typename alloc_traits::template rebind_alloc<NewType> new_alloc;
     typedef thrust::detail::vector_base<NewType, new_alloc> type;
 };
 
diff --git a/testing/caching_allocator.cu b/testing/caching_allocator.cu
new file mode 100644
index 000000000..f98ea336b
--- /dev/null
+++ b/testing/caching_allocator.cu
@@ -0,0 +1,23 @@
+#include <unittest/unittest.h>
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/caching_allocator.h>
+
+template<typename Allocator>
+void test_implementation(Allocator alloc)
+{
+    typedef typename thrust::detail::allocator_traits<Allocator> Traits;
+    typedef typename Allocator::pointer Ptr;
+
+    Ptr p = Traits::allocate(alloc, 123);
+    Traits::deallocate(alloc, p, 123);
+
+    Ptr p2 = Traits::allocate(alloc, 123);
+    ASSERT_EQUAL(p, p2);
+}
+
+void TestSingleDeviceTLSCachingAllocator()
+{
+    test_implementation(thrust::detail::single_device_tls_caching_allocator());
+};
+DECLARE_UNITTEST(TestSingleDeviceTLSCachingAllocator);
diff --git a/testing/cmake/CMakeLists.txt b/testing/cmake/CMakeLists.txt
new file mode 100644
index 000000000..71798de75
--- /dev/null
+++ b/testing/cmake/CMakeLists.txt
@@ -0,0 +1,37 @@
+thrust_update_system_found_flags()
+
+set(extra_cmake_flags)
+
+# Need to pass these when testing NVC++.
+if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  set(extra_cmake_flags
+    -D "CMAKE_CUDA_COMPILER_ID=${CMAKE_CUDA_COMPILER_ID}"
+    -D "CMAKE_CUDA_COMPILER_FORCED=${CMAKE_CUDA_COMPILER_FORCED}"
+  )
+endif()
+
+if (THRUST_CPP_FOUND AND THRUST_CUDA_FOUND)
+  # Test that we can use `find_package` on an installed Thrust:
+  add_test(
+    NAME thrust.test.cmake.test_install
+    COMMAND "${CMAKE_COMMAND}"
+      --log-level=VERBOSE
+      -G "${CMAKE_GENERATOR}"
+      -S "${CMAKE_CURRENT_SOURCE_DIR}/test_install"
+      -B "${CMAKE_CURRENT_BINARY_DIR}/test_install"
+      -D "THRUST_BINARY_DIR=${Thrust_BINARY_DIR}"
+      -D "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
+      -D "CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}"
+      -D "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
+      ${extra_cmake_flags}
+  )
+endif()
+
+# Check source code for issues that can be found by pattern matching:
+add_test(
+  NAME thrust.test.cmake.check_source_files
+  COMMAND
+    "${CMAKE_COMMAND}"
+      -D "Thrust_SOURCE_DIR=${Thrust_SOURCE_DIR}"
+      -P "${CMAKE_CURRENT_LIST_DIR}/check_source_files.cmake"
+)
diff --git a/testing/cmake/check_source_files.cmake b/testing/cmake/check_source_files.cmake
new file mode 100644
index 000000000..900300c67
--- /dev/null
+++ b/testing/cmake/check_source_files.cmake
@@ -0,0 +1,185 @@
+# Check all source files for various issues that can be detected using pattern
+# matching.
+#
+# This is run as a ctest test named `thrust.test.cmake.check_source_files`, or
+# manually with:
+# cmake -D "Thrust_SOURCE_DIR=<thrust project root>" -P check_source_files.cmake
+
+cmake_minimum_required(VERSION 3.15)
+
+function(count_substrings input search_regex output_var)
+  string(REGEX MATCHALL "${search_regex}" matches "${input}")
+  list(LENGTH matches num_matches)
+  set(${output_var} ${num_matches} PARENT_SCOPE)
+endfunction()
+
+set(found_errors 0)
+file(GLOB_RECURSE thrust_srcs
+  RELATIVE "${Thrust_SOURCE_DIR}"
+  "${Thrust_SOURCE_DIR}/thrust/*.h"
+  "${Thrust_SOURCE_DIR}/thrust/*.inl"
+)
+
+################################################################################
+# Namespace checks.
+# Check all files in thrust to make sure that they use
+# THRUST_NAMESPACE_BEGIN/END instead of bare `namespace thrust {}` declarations.
+set(namespace_exclusions
+  # This defines the macros and must have bare namespace declarations:
+  thrust/detail/config/namespace.h
+)
+
+set(bare_ns_regex "namespace[ \n\r\t]+thrust[ \n\r\t]*\\{")
+
+# Validation check for the above regex:
+count_substrings([=[
+namespace thrust{
+namespace thrust {
+namespace  thrust  {
+ namespace thrust {
+namespace thrust
+{
+namespace
+thrust
+{
+]=]
+  ${bare_ns_regex} valid_count)
+if (NOT valid_count EQUAL 6)
+  message(FATAL_ERROR "Validation of bare namespace regex failed: "
+                      "Matched ${valid_count} times, expected 6.")
+endif()
+
+################################################################################
+# stdpar header checks.
+# Check all files in Thrust to make sure that they aren't including <algorithm>
+# or <memory>, both of which will cause circular dependencies in nvc++'s
+# stdpar library.
+#
+# The headers following headers should be used instead:
+# <algorithm> -> <thrust/detail/algorithm_wrapper.h>
+# <memory>    -> <thrust/detail/memory_wrapper.h>
+#
+set(stdpar_header_exclusions
+  # The wrappers are allowed to include the unwrapped headers
+  thrust/detail/algorithm_wrapper.h
+  thrust/detail/memory_wrapper.h
+  thrust/detail/numeric_wrapper.h
+)
+
+set(algorithm_regex "#[ \t]*include[ \t]+<algorithm>")
+set(memory_regex    "#[ \t]*include[ \t]+<memory>")
+set(numeric_regex   "#[ \t]*include[ \t]+<numeric>")
+
+# Validation check for the above regex pattern:
+count_substrings([=[
+#include <algorithm>
+# include <algorithm>
+#include  <algorithm>
+# include  <algorithm>
+# include  <algorithm> // ...
+]=]
+  ${algorithm_regex} valid_count)
+if (NOT valid_count EQUAL 5)
+  message(FATAL_ERROR "Validation of stdpar header regex failed: "
+    "Matched ${valid_count} times, expected 5.")
+endif()
+
+################################################################################
+# Legacy macro checks.
+# Check all files in Thrust to make sure that they aren't using the legacy
+# CUB_RUNTIME_ENABLED and __THRUST_HAS_CUDART__ macros.
+#
+# These macros depend on __CUDA_ARCH__ and are not compatible with NV_IF_TARGET.
+# They are provided for legacy purposes and should be replaced with
+# [THRUST|CUB]_RDC_ENABLED and NV_IF_TARGET in Thrust/CUB code.
+#
+#
+set(legacy_macro_header_exclusions
+  # This header defines a legacy CUDART macro:
+  thrust/system/cuda/config.h
+)
+
+set(cub_legacy_macro_regex "CUB_RUNTIME_ENABLED")
+set(thrust_legacy_macro_regex "__THRUST_HAS_CUDART__")
+
+################################################################################
+# Read source files:
+foreach(src ${thrust_srcs})
+  file(READ "${Thrust_SOURCE_DIR}/${src}" src_contents)
+
+  if (NOT ${src} IN_LIST namespace_exclusions)
+    count_substrings("${src_contents}" "${bare_ns_regex}" bare_ns_count)
+    count_substrings("${src_contents}" THRUST_NS_PREFIX prefix_count)
+    count_substrings("${src_contents}" THRUST_NS_POSTFIX postfix_count)
+    count_substrings("${src_contents}" THRUST_NAMESPACE_BEGIN begin_count)
+    count_substrings("${src_contents}" THRUST_NAMESPACE_END end_count)
+    count_substrings("${src_contents}" "#include <thrust/detail/config.h>" header_count)
+
+    if (NOT bare_ns_count EQUAL 0)
+      message("'${src}' contains 'namespace thrust {...}'. Replace with THRUST_NAMESPACE macros.")
+      set(found_errors 1)
+    endif()
+
+    if (NOT prefix_count EQUAL 0)
+      message("'${src}' contains 'THRUST_NS_PREFIX'. Replace with THRUST_NAMESPACE macros.")
+      set(found_errors 1)
+    endif()
+
+    if (NOT postfix_count EQUAL 0)
+      message("'${src}' contains 'THRUST_NS_POSTFIX'. Replace with THRUST_NAMESPACE macros.")
+      set(found_errors 1)
+    endif()
+
+    if (NOT begin_count EQUAL end_count)
+      message("'${src}' namespace macros are unbalanced:")
+      message(" - THRUST_NAMESPACE_BEGIN occurs ${begin_count} times.")
+      message(" - THRUST_NAMESPACE_END   occurs ${end_count} times.")
+      set(found_errors 1)
+    endif()
+
+    if (begin_count GREATER 0 AND header_count EQUAL 0)
+      message("'${src}' uses Thrust namespace macros, but does not (directly) `#include <thrust/detail/config.h>`.")
+      set(found_errors 1)
+    endif()
+  endif()
+
+  if (NOT ${src} IN_LIST stdpar_header_exclusions)
+    count_substrings("${src_contents}" "${algorithm_regex}" algorithm_count)
+    count_substrings("${src_contents}" "${memory_regex}" memory_count)
+    count_substrings("${src_contents}" "${numeric_regex}" numeric_count)
+
+    if (NOT algorithm_count EQUAL 0)
+      message("'${src}' includes the <algorithm> header. Replace with <thrust/detail/algorithm_wrapper.h>.")
+      set(found_errors 1)
+    endif()
+
+    if (NOT memory_count EQUAL 0)
+      message("'${src}' includes the <memory> header. Replace with <thrust/detail/memory_wrapper.h>.")
+      set(found_errors 1)
+    endif()
+
+    if (NOT numeric_count EQUAL 0)
+      message("'${src}' includes the <numeric> header. Replace with <thrust/detail/numeric_wrapper.h>.")
+      set(found_errors 1)
+    endif()
+  endif()
+
+  if (NOT ${src} IN_LIST legacy_macro_header_exclusions)
+    count_substrings("${src_contents}" "${thrust_legacy_macro_regex}" thrust_count)
+    count_substrings("${src_contents}" "${cub_legacy_macro_regex}" cub_count)
+
+    if (NOT thrust_count EQUAL 0)
+      message("'${src}' uses __THRUST_HAS_CUDART__. Replace with THRUST_RDC_ENABLED and NV_IF_TARGET.")
+      set(found_errors 1)
+    endif()
+
+    if (NOT cub_count EQUAL 0)
+      message("'${src}' uses CUB_RUNTIME_ENABLED. Replace with CUB_RDC_ENABLED and NV_IF_TARGET.")
+      set(found_errors 1)
+    endif()
+  endif()
+endforeach()
+
+if (NOT found_errors EQUAL 0)
+  message(FATAL_ERROR "Errors detected.")
+endif()
diff --git a/testing/cmake/test_install/CMakeLists.txt b/testing/cmake/test_install/CMakeLists.txt
new file mode 100644
index 000000000..30cf8405c
--- /dev/null
+++ b/testing/cmake/test_install/CMakeLists.txt
@@ -0,0 +1,110 @@
+# Test that an installation of the project can be located by find_package() call
+# with appropriate prefix settings.
+#
+# Expects THRUST_BINARY_DIR to be set to an existing thrust build directory.
+
+cmake_minimum_required(VERSION 3.15)
+
+project(ThrustTestInstall CXX CUDA)
+
+# This will eventually get deleted recursively -- keep that in mind if modifying
+set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/install_prefix/")
+
+function(do_manual_install)
+  # Inspired by the VTK-m install tests, we can just glob up all of the
+  # cmake_install.cmake, include (ie. run) them, and they'll effectively
+  # install the project into the current value of CMAKE_INSTALL_PREFIX.
+
+  # Gather all of the install files from Thrust's root:
+  file(GLOB_RECURSE install_files
+    LIST_DIRECTORIES False
+    "${THRUST_BINARY_DIR}/cmake_install.cmake"
+  )
+
+  message(STATUS "Locating install files...")
+  foreach (install_file IN LISTS install_files)
+    message(STATUS "  * ${install_file}")
+  endforeach()
+
+  message(STATUS "Building install tree...")
+  foreach(install_file IN LISTS install_files)
+    include("${install_file}")
+  endforeach()
+endfunction()
+
+function(do_cleanup)
+  message(STATUS "Removing ${CMAKE_INSTALL_PREFIX}")
+  file(REMOVE_RECURSE "${CMAKE_INSTALL_PREFIX}")
+endfunction()
+
+function(assert_boolean var_name expect)
+  if (expect)
+    if (NOT ${var_name})
+      message(FATAL_ERROR "'${var_name}' is false, expected true.")
+    endif()
+  else()
+    if (${var_name})
+      message(FATAL_ERROR "'${var_name}' is true, expected false.")
+    endif()
+  endif()
+endfunction()
+
+function(assert_target target_name)
+  if (NOT TARGET "${target_name}")
+    message(FATAL_ERROR "Target '${target_name}' not defined.")
+  endif()
+endfunction()
+
+function(find_installed_project)
+  set(CMAKE_PREFIX_PATH "${CMAKE_INSTALL_PREFIX}")
+  find_package(Thrust CONFIG COMPONENTS CPP CUDA)
+
+  if (NOT Thrust_FOUND)
+    message(FATAL_ERROR
+      "find_package(Thrust) failed. "
+      "CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}"
+    )
+  endif()
+
+  # Test some internal config vars to check that this is the expected install:
+  # TODO The cmake_path (3.19) command will provide more robust ways to do this
+
+  # Escape regex special characters in the install prefix, see
+  # https://gitlab.kitware.com/cmake/cmake/-/issues/18580
+  string(REGEX REPLACE "([][+.*()^])" "\\\\\\1"
+    prefix_regex
+    "${CMAKE_INSTALL_PREFIX}"
+  )
+  if (NOT _THRUST_INCLUDE_DIR MATCHES "^${prefix_regex}")
+    message(FATAL_ERROR
+      "Found Thrust in unexpected location: "
+      " * _THRUST_INCLUDE_DIR=${_THRUST_INCLUDE_DIR} "
+      " * ExpectedPrefix=${CMAKE_INSTALL_DIR}"
+    )
+  endif()
+  if (NOT _CUB_INCLUDE_DIR MATCHES "^${prefix_regex}")
+    message(FATAL_ERROR
+      "Found CUB in unexpected location: "
+      " * _CUB_INCLUDE_DIR=${_CUB_INCLUDE_DIR} "
+      " * ExpectedPrefix=${CMAKE_INSTALL_DIR}"
+    )
+  endif()
+
+  thrust_create_target(Thrust)
+  assert_target(Thrust)
+  assert_target(CUB::CUB)
+  assert_target(Thrust::CPP::Host)
+  assert_target(Thrust::CUDA::Device)
+
+  thrust_update_system_found_flags()
+  assert_boolean(THRUST_CPP_FOUND TRUE)
+  assert_boolean(THRUST_CUDA_FOUND TRUE)
+  assert_boolean(THRUST_OMP_FOUND FALSE)
+  assert_boolean(THRUST_TBB_FOUND FALSE)
+
+endfunction()
+
+do_cleanup() # Prepare for new installation
+do_manual_install()
+find_installed_project()
+do_cleanup() # Clean up if successful
diff --git a/testing/complex.cu b/testing/complex.cu
index e69f2e7cd..cf980962a 100644
--- a/testing/complex.cu
+++ b/testing/complex.cu
@@ -1,6 +1,8 @@
 #include <unittest/unittest.h>
 
 #include <thrust/complex.h>
+#include <thrust/detail/config.h>
+
 #include <complex>
 #include <iostream>
 #include <sstream>
@@ -273,7 +275,7 @@ struct TestComplexTrigonometricFunctions
     ASSERT_ALMOST_EQUAL(sinh(a),sinh(c));
     ASSERT_ALMOST_EQUAL(tanh(a),tanh(c));
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
 
     ASSERT_ALMOST_EQUAL(acos(a),acos(c));
     ASSERT_ALMOST_EQUAL(asin(a),asin(c));
diff --git a/testing/complex_transform.cu b/testing/complex_transform.cu
index c4496aad6..439597a0d 100644
--- a/testing/complex_transform.cu
+++ b/testing/complex_transform.cu
@@ -235,15 +235,6 @@ struct TestComplexArithmeticTransform
     thrust::device_vector<type> d_p1 = h_p1;
     thrust::device_vector<type> d_p2 = h_p2;
     thrust::device_vector<type> d_result(n);
-#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-    if(const CUDATestDriver *driver = dynamic_cast<const CUDATestDriver*>(&UnitTestDriver::s_driver()))
-    {
-      if(sizeof(T) == sizeof(double) && driver->current_device_architecture() < 200)
-      {
-        KNOWN_FAILURE;
-      } // end if
-    } // end if
-#endif
 
     thrust::transform(h_p1.begin(), h_p1.end(), h_p2.begin(), h_result.begin(), basic_arithmetic_functor());
     thrust::transform(d_p1.begin(), d_p1.end(), d_p2.begin(), d_result.begin(), basic_arithmetic_functor());    
@@ -264,16 +255,6 @@ struct TestComplexPlaneTransform
     thrust::device_vector<type> d_p1 = h_p1;
     thrust::device_vector<type> d_result(n);
 
-#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-    if(const CUDATestDriver *driver = dynamic_cast<const CUDATestDriver*>(&UnitTestDriver::s_driver()))
-    {
-      if(sizeof(T) == sizeof(double) && driver->current_device_architecture() < 200)
-      {
-        KNOWN_FAILURE;
-      } // end if
-    } // end if
-#endif
-
     thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), complex_plane_functor());
     thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), complex_plane_functor());    
     ASSERT_ALMOST_EQUAL(h_result, d_result);
@@ -296,16 +277,6 @@ struct TestComplexPowerTransform
     thrust::device_vector<type> d_p2 = h_p2;
     thrust::device_vector<type> d_result(n);
 
-#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-    if(const CUDATestDriver *driver = dynamic_cast<const CUDATestDriver*>(&UnitTestDriver::s_driver()))
-    {
-      if(sizeof(T) == sizeof(double) && driver->current_device_architecture() < 200)
-      {
-        KNOWN_FAILURE;
-      } // end if
-    } // end if
-#endif
-
     thrust::transform(h_p1.begin(), h_p1.end(), h_p2.begin(), h_result.begin(), pow_functor());
     thrust::transform(d_p1.begin(), d_p1.end(), d_p2.begin(), d_result.begin(), pow_functor());    
     // pow can be very innacurate there's no point trying to check for equality
@@ -331,16 +302,6 @@ struct TestComplexExponentialTransform
     thrust::device_vector<type> d_p1 = h_p1;
     thrust::device_vector<type> d_result(n);
 
-#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-    if(const CUDATestDriver *driver = dynamic_cast<const CUDATestDriver*>(&UnitTestDriver::s_driver()))
-    {
-      if(sizeof(T) == sizeof(double) && driver->current_device_architecture() < 200)
-      {
-        KNOWN_FAILURE;
-      } // end if
-    } // end if
-#endif
-
     thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), exp_functor());
     thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), exp_functor());    
     ASSERT_ALMOST_EQUAL(h_result, d_result);
@@ -368,15 +329,6 @@ struct TestComplexTrigonometricTransform
     thrust::device_vector<type> d_p1 = h_p1;
     thrust::device_vector<type> d_result(n);
 
-#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-    if(const CUDATestDriver *driver = dynamic_cast<const CUDATestDriver*>(&UnitTestDriver::s_driver()))
-    {
-      if(sizeof(T) == sizeof(double) && driver->current_device_architecture() < 200)
-      {
-        KNOWN_FAILURE;
-      } // end if
-    } // end if
-#endif
 
     thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), sin_functor());
     thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), sin_functor());    
@@ -404,7 +356,6 @@ struct TestComplexTrigonometricTransform
     ASSERT_ALMOST_EQUAL(h_result, d_result);
 
 
-
     thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), asin_functor());
     thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), asin_functor());    
     ASSERT_ALMOST_EQUAL(h_result, d_result);
diff --git a/testing/constant_iterator.cu b/testing/constant_iterator.cu
index cbf771c9a..e42cfea8d 100644
--- a/testing/constant_iterator.cu
+++ b/testing/constant_iterator.cu
@@ -109,11 +109,12 @@ void TestConstantIteratorCopy(void)
 {
   using namespace thrust;
 
-  typedef constant_iterator<int> ConstIter;
+  using ValueType = typename Vector::value_type;
+  using ConstIter = constant_iterator<ValueType>;
 
   Vector result(4);
 
-  ConstIter first = make_constant_iterator<int>(7);
+  ConstIter first = make_constant_iterator<ValueType>(7);
   ConstIter last  = first + result.size();
   thrust::copy(first, last, result.begin());
 
diff --git a/testing/copy.cu b/testing/copy.cu
index 342788acf..661e379a2 100644
--- a/testing/copy.cu
+++ b/testing/copy.cu
@@ -1,14 +1,19 @@
 #include <unittest/unittest.h>
 #include <thrust/copy.h>
 
+#include <array>
+#include <algorithm>
 #include <list>
 #include <iterator>
+#include <thrust/detail/config.h>
 #include <thrust/sequence.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/iterator/retag.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
 
 void TestCopyFromConstIterator(void)
 {
@@ -336,9 +341,6 @@ void TestCopyIfSequence(const size_t n)
     thrust::host_vector<T>   h_data(n); thrust::sequence(h_data.begin(), h_data.end());
     thrust::device_vector<T> d_data(n); thrust::sequence(d_data.begin(), d_data.end());
 
-    thrust::host_vector<T>   h_result(n);
-    thrust::device_vector<T> d_result(n);
-
     typename thrust::host_vector<T>::iterator   h_new_end;
     typename thrust::device_vector<T>::iterator d_new_end;
 
@@ -405,9 +407,6 @@ void TestCopyIfStencil(const size_t n)
     thrust::host_vector<T>   h_stencil = unittest::random_integers<T>(n);
     thrust::device_vector<T> d_stencil = unittest::random_integers<T>(n);
 
-    thrust::host_vector<T>   h_result(n);
-    thrust::device_vector<T> d_result(n);
-
     typename thrust::host_vector<T>::iterator   h_new_end;
     typename thrust::device_vector<T>::iterator d_new_end;
 
@@ -427,6 +426,100 @@ void TestCopyIfStencil(const size_t n)
 }
 DECLARE_INTEGRAL_VARIABLE_UNITTEST(TestCopyIfStencil);
 
+namespace
+{
+
+struct object_with_non_trivial_ctor
+{
+  // This struct will only properly assign if its `magic` member is
+  // set to this certain number.
+  static constexpr int MAGIC = 923390;
+
+  int field;
+  int magic;
+
+  __host__ __device__ object_with_non_trivial_ctor()
+  {
+    magic = MAGIC;
+    field = 0;
+  }
+  __host__ __device__ object_with_non_trivial_ctor(int f)
+  {
+    magic = MAGIC;
+    field = f;
+  }
+
+  object_with_non_trivial_ctor(const object_with_non_trivial_ctor& x) = default;
+
+  // This non-trivial assignment requires that `this` points to initialized
+  // memory
+  __host__ __device__ object_with_non_trivial_ctor&
+  operator=(const object_with_non_trivial_ctor& x)
+  {
+    // To really copy over x's field value, require we have magic value set.
+    // If copy_if copies to uninitialized bits, the field will rarely be 923390.
+    if (magic == MAGIC)
+    {
+      field = x.field;
+    }
+    return *this;
+  }
+};
+
+struct always_true
+{
+  __host__ __device__
+  bool operator()(const object_with_non_trivial_ctor&)
+  {
+    return true;
+  };
+};
+
+} // end anon namespace
+
+void TestCopyIfNonTrivial()
+{
+  // Attempting to copy an object_with_non_trivial_ctor into uninitialized
+  // memory will fail:
+  {
+    static constexpr size_t BufferAlign = alignof(object_with_non_trivial_ctor);
+    static constexpr size_t BufferSize = sizeof(object_with_non_trivial_ctor);
+    alignas(BufferAlign) std::array<unsigned char, BufferSize> buffer;
+
+    // Fill buffer with 0s to prevent warnings about uninitialized reads while
+    // ensure that the 'magic number' mechanism works as intended:
+    std::fill(buffer.begin(), buffer.end(), 0);
+
+    object_with_non_trivial_ctor initialized;
+    object_with_non_trivial_ctor *uninitialized =
+      reinterpret_cast<object_with_non_trivial_ctor*>(buffer.data());
+
+    object_with_non_trivial_ctor source(42);
+    initialized = source;
+    *uninitialized = source;
+
+    ASSERT_EQUAL(42, initialized.field);
+    ASSERT_NOT_EQUAL(42, uninitialized->field);
+  }
+
+  // This test ensures that we use placement new instead of assigning
+  // to uninitialized memory. See Thrust Github issue #1153.
+  thrust::device_vector<object_with_non_trivial_ctor> a(10, object_with_non_trivial_ctor(99));
+  thrust::device_vector<object_with_non_trivial_ctor> b(10);
+
+  thrust::copy_if(a.begin(), a.end(), b.begin(), always_true());
+
+  for (int i = 0; i < 10; i++)
+  {
+    object_with_non_trivial_ctor ha(a[i]);
+    object_with_non_trivial_ctor hb(b[i]);
+    int ia = ha.field;
+    int ib = hb.field;
+
+    ASSERT_EQUAL(ia, ib);
+  }
+}
+DECLARE_UNITTEST(TestCopyIfNonTrivial);
 
 template <typename Vector>
 void TestCopyCountingIterator(void)
@@ -617,3 +710,72 @@ void TestCopyIfStencilDispatchImplicit()
 }
 DECLARE_UNITTEST(TestCopyIfStencilDispatchImplicit);
 
+struct only_set_when_expected_it
+{
+    long long expected;
+    bool * flag;
+
+    __host__ __device__ only_set_when_expected_it operator++() const { return *this; }
+    __host__ __device__ only_set_when_expected_it operator*() const { return *this; }
+    template<typename Difference>
+    __host__ __device__ only_set_when_expected_it operator+(Difference) const { return *this; }
+    template<typename Difference>
+    __host__ __device__ only_set_when_expected_it operator+=(Difference) const { return *this; }
+    template<typename Index>
+    __host__ __device__ only_set_when_expected_it operator[](Index) const { return *this; }
+
+    __device__
+    void operator=(long long value) const
+    {
+        if (value == expected)
+        {
+            *flag = true;
+        }
+    }
+};
+
+THRUST_NAMESPACE_BEGIN
+namespace detail
+{
+// We need this type to pass as a non-const ref for unary_transform_functor
+// to compile:
+template <>
+struct is_non_const_reference<only_set_when_expected_it> : thrust::true_type {};
+} // end namespace detail
+
+template<>
+struct iterator_traits<only_set_when_expected_it>
+{
+    typedef long long value_type;
+    typedef only_set_when_expected_it reference;
+    typedef thrust::random_access_device_iterator_tag iterator_category;
+};
+THRUST_NAMESPACE_END
+
+void TestCopyWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(0);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> has_executed = thrust::device_malloc<bool>(1);
+    *has_executed = false;
+
+    only_set_when_expected_it out = { (1ll << magnitude) - 1, thrust::raw_pointer_cast(has_executed) };
+
+    thrust::copy(thrust::device, begin, end, out);
+
+    bool has_executed_h = *has_executed;
+    thrust::device_free(has_executed);
+
+    ASSERT_EQUAL(has_executed_h, true);
+}
+
+void TestCopyWithBigIndexes()
+{
+    TestCopyWithBigIndexesHelper(30);
+    TestCopyWithBigIndexesHelper(31);
+    TestCopyWithBigIndexesHelper(32);
+    TestCopyWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestCopyWithBigIndexes);
diff --git a/testing/count.cu b/testing/count.cu
index 10c951c47..a6021da79 100644
--- a/testing/count.cu
+++ b/testing/count.cu
@@ -116,3 +116,22 @@ void TestCountDispatchImplicit()
 }
 DECLARE_UNITTEST(TestCountDispatchImplicit);
 
+void TestCountWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    long long result = thrust::count(thrust::device, begin, end, (1ll << magnitude) - 17);
+
+    ASSERT_EQUAL(result, 1);
+}
+
+void TestCountWithBigIndexes()
+{
+    TestCountWithBigIndexesHelper(30);
+    TestCountWithBigIndexesHelper(31);
+    TestCountWithBigIndexesHelper(32);
+    TestCountWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestCountWithBigIndexes);
diff --git a/testing/counting_iterator.cu b/testing/counting_iterator.cu
index eede510fc..ebefe4d64 100644
--- a/testing/counting_iterator.cu
+++ b/testing/counting_iterator.cu
@@ -8,6 +8,14 @@
 
 THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
 
+template <typename T>
+void TestCountingDefaultConstructor(void)
+{
+  thrust::counting_iterator<T> iter0;
+  ASSERT_EQUAL(*iter0, T{});
+}
+DECLARE_GENERIC_UNITTEST(TestCountingDefaultConstructor);
+
 void TestCountingIteratorCopyConstructor(void)
 {
     thrust::counting_iterator<int> iter0(100);
diff --git a/testing/cpp/CMakeLists.txt b/testing/cpp/CMakeLists.txt
new file mode 100644
index 000000000..215b81ee4
--- /dev/null
+++ b/testing/cpp/CMakeLists.txt
@@ -0,0 +1,18 @@
+file(GLOB test_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}}"
+  CONFIGURE_DEPENDS
+  *.cu *.cpp
+)
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  if (NOT config_device STREQUAL "CPP")
+    continue()
+  endif()
+
+  foreach(test_src IN LISTS test_srcs)
+    get_filename_component(test_name "${test_src}" NAME_WLE)
+    string(PREPEND test_name "cpp.")
+    thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target})
+  endforeach()
+endforeach()
diff --git a/testing/cpp/adjacent_difference.cu b/testing/cpp/adjacent_difference.cu
new file mode 100644
index 000000000..584899bec
--- /dev/null
+++ b/testing/cpp/adjacent_difference.cu
@@ -0,0 +1,54 @@
+#include <unittest/unittest.h>
+#include <thrust/adjacent_difference.h>
+#include <thrust/execution_policy.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
+
+struct detect_wrong_difference
+{
+    bool * flag;
+
+    __host__ __device__ detect_wrong_difference operator++() const { return *this; }
+    __host__ __device__ detect_wrong_difference operator*() const { return *this; }
+    template<typename Difference>
+    __host__ __device__ detect_wrong_difference operator+(Difference) const { return *this; }
+    template<typename Index>
+    __host__ __device__ detect_wrong_difference operator[](Index) const { return *this; }
+
+    __device__
+    void operator=(long long difference) const
+    {
+        if (difference != 1)
+        {
+            *flag = false;
+        }
+    }
+};
+
+void TestAdjacentDifferenceWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> all_differences_correct = thrust::device_malloc<bool>(1);
+    *all_differences_correct = true;
+
+    detect_wrong_difference out = { thrust::raw_pointer_cast(all_differences_correct) };
+
+    thrust::adjacent_difference(thrust::device, begin, end, out);
+
+    bool all_differences_correct_h = *all_differences_correct;
+    thrust::device_free(all_differences_correct);
+
+    ASSERT_EQUAL(all_differences_correct_h, true);
+}
+
+void TestAdjacentDifferenceWithBigIndexes()
+{
+    TestAdjacentDifferenceWithBigIndexesHelper(30);
+    TestAdjacentDifferenceWithBigIndexesHelper(31);
+    TestAdjacentDifferenceWithBigIndexesHelper(32);
+    TestAdjacentDifferenceWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestAdjacentDifferenceWithBigIndexes);
diff --git a/testing/cuda/CMakeLists.txt b/testing/cuda/CMakeLists.txt
new file mode 100644
index 000000000..8fe4a4be7
--- /dev/null
+++ b/testing/cuda/CMakeLists.txt
@@ -0,0 +1,35 @@
+file(GLOB test_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}}"
+  CONFIGURE_DEPENDS
+  *.cu *.cpp
+)
+
+# These tests always build with RDC, so make sure that the sm_XX flags are
+# compatible. See note in ThrustCudaConfig.cmake.
+# TODO once we're using CUDA_ARCHITECTURES, we can setup non-rdc fallback
+# tests to build for non-rdc arches. But for now, all files in a given directory
+# must build with the same `CMAKE_CUDA_FLAGS` due to CMake constraints around
+# how CUDA_FLAGS works.
+set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_RDC}")
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  if (NOT config_device STREQUAL "CUDA")
+    continue()
+  endif()
+
+  foreach(test_src IN LISTS test_srcs)
+    get_filename_component(test_name "${test_src}" NAME_WLE)
+    string(PREPEND test_name "cuda.")
+
+    # Create two targets, one with RDC enabled, the other without. This tests
+    # both device-side behaviors -- the CDP kernel launch with RDC, and the
+    # serial fallback path without RDC.
+    thrust_add_test(seq_test_target ${test_name}.cdp_0 "${test_src}" ${thrust_target})
+
+    if (THRUST_ENABLE_TESTS_WITH_RDC)
+      thrust_add_test(cdp_test_target ${test_name}.cdp_1 "${test_src}" ${thrust_target})
+      thrust_enable_rdc_for_cuda_target(${cdp_test_target})
+    endif()
+  endforeach()
+endforeach()
diff --git a/testing/cuda/adjacent_difference.cu b/testing/cuda/adjacent_difference.cu
index 1e0b5a784..9b101ea2e 100644
--- a/testing/cuda/adjacent_difference.cu
+++ b/testing/cuda/adjacent_difference.cu
@@ -1,8 +1,11 @@
 #include <unittest/unittest.h>
 #include <thrust/adjacent_difference.h>
 #include <thrust/execution_policy.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
 __global__ void adjacent_difference_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
 {
@@ -22,28 +25,28 @@ void TestAdjacentDifferenceDevice(ExecutionPolicy exec, const size_t n)
 {
   thrust::host_vector<T>   h_input = unittest::random_samples<T>(n);
   thrust::device_vector<T> d_input = h_input;
-  
+
   thrust::host_vector<T>   h_output(n);
   thrust::device_vector<T> d_output(n);
-  
+
   thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin());
   adjacent_difference_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_output.begin());
   {
     cudaError_t const err = cudaDeviceSynchronize();
     ASSERT_EQUAL(cudaSuccess, err);
   }
-  
+
   ASSERT_EQUAL(h_output, d_output);
-  
+
   thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin(), thrust::plus<T>());
   adjacent_difference_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_output.begin(), thrust::plus<T>());
   {
     cudaError_t const err = cudaDeviceSynchronize();
     ASSERT_EQUAL(cudaSuccess, err);
   }
-  
+
   ASSERT_EQUAL(h_output, d_output);
-  
+
   // in-place operation
   thrust::adjacent_difference(h_input.begin(), h_input.end(), h_input.begin(), thrust::plus<T>());
   adjacent_difference_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_input.begin(), thrust::plus<T>());
@@ -51,7 +54,7 @@ void TestAdjacentDifferenceDevice(ExecutionPolicy exec, const size_t n)
     cudaError_t const err = cudaDeviceSynchronize();
     ASSERT_EQUAL(cudaSuccess, err);
   }
-  
+
   ASSERT_EQUAL(h_input, h_output); //computed previously
   ASSERT_EQUAL(d_input, d_output); //computed previously
 }
@@ -71,21 +74,22 @@ void TestAdjacentDifferenceDeviceDevice(const size_t n)
   TestAdjacentDifferenceDevice<T>(thrust::device, n);
 }
 DECLARE_VARIABLE_UNITTEST(TestAdjacentDifferenceDeviceDevice);
+#endif
 
 
 void TestAdjacentDifferenceCudaStreams()
 {
   cudaStream_t s;
   cudaStreamCreate(&s);
-  
+
   thrust::device_vector<int> input(3);
   thrust::device_vector<int> output(3);
   input[0] = 1; input[1] = 4; input[2] = 6;
-  
+
   thrust::adjacent_difference(thrust::cuda::par.on(s), input.begin(), input.end(), output.begin());
 
   cudaStreamSynchronize(s);
-  
+
   ASSERT_EQUAL(output[0], 1);
   ASSERT_EQUAL(output[1], 3);
   ASSERT_EQUAL(output[2], 2);
@@ -94,3 +98,57 @@ void TestAdjacentDifferenceCudaStreams()
 }
 DECLARE_UNITTEST(TestAdjacentDifferenceCudaStreams);
 
+struct detect_wrong_difference
+{
+    using difference_type = void;
+    using value_type = void;
+    using pointer = void;
+    using reference = void;
+    using iterator_category = std::output_iterator_tag;
+
+    bool * flag;
+
+    __host__ __device__ detect_wrong_difference operator++() const { return *this; }
+    __host__ __device__ detect_wrong_difference operator*() const { return *this; }
+    template<typename Difference>
+    __host__ __device__ detect_wrong_difference operator+(Difference) const { return *this; }
+    template<typename Index>
+    __host__ __device__ detect_wrong_difference operator[](Index) const { return *this; }
+
+    __device__
+    void operator=(long long difference) const
+    {
+        if (difference != 1)
+        {
+            *flag = false;
+        }
+    }
+};
+
+void TestAdjacentDifferenceWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> all_differences_correct = thrust::device_malloc<bool>(1);
+    *all_differences_correct = true;
+
+    detect_wrong_difference out = { thrust::raw_pointer_cast(all_differences_correct) };
+
+    thrust::adjacent_difference(thrust::device, begin, end, out);
+
+    bool all_differences_correct_h = *all_differences_correct;
+    thrust::device_free(all_differences_correct);
+
+    ASSERT_EQUAL(all_differences_correct_h, true);
+}
+
+void TestAdjacentDifferenceWithBigIndexes()
+{
+    TestAdjacentDifferenceWithBigIndexesHelper(30);
+    TestAdjacentDifferenceWithBigIndexesHelper(31);
+    TestAdjacentDifferenceWithBigIndexesHelper(32);
+    TestAdjacentDifferenceWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestAdjacentDifferenceWithBigIndexes);
diff --git a/testing/cuda/adjacent_difference.mk b/testing/cuda/adjacent_difference.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/adjacent_difference.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/binary_search.cu b/testing/cuda/binary_search.cu
new file mode 100644
index 000000000..58a83f61c
--- /dev/null
+++ b/testing/cuda/binary_search.cu
@@ -0,0 +1,25 @@
+#include <unittest/unittest.h>
+
+#include <thrust/binary_search.h>
+#include <thrust/device_vector.h>
+#include <thrust/distance.h>
+#include <thrust/pair.h>
+#include <thrust/sequence.h>
+
+void TestEqualRangeOnStream()
+{ // Regression test for GH issue #921 (nvbug 2173437)
+  typedef typename thrust::device_vector<int> vector_t;
+  typedef typename vector_t::iterator iterator_t;
+  typedef thrust::pair<iterator_t, iterator_t> result_t;
+
+  vector_t input(10);
+  thrust::sequence(thrust::device, input.begin(), input.end(), 0);
+  cudaStream_t stream = 0;
+  result_t result = thrust::equal_range(thrust::cuda::par.on(stream),
+                                        input.begin(), input.end(),
+                                        5);
+
+  ASSERT_EQUAL(5, thrust::distance(input.begin(), result.first));
+  ASSERT_EQUAL(6, thrust::distance(input.begin(), result.second));
+}
+DECLARE_UNITTEST(TestEqualRangeOnStream);
diff --git a/testing/cuda/binary_search.mk b/testing/cuda/binary_search.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/binary_search.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/complex.mk b/testing/cuda/complex.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/complex.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/copy.cu b/testing/cuda/copy.cu
index 1ad6e2626..6fe91853d 100644
--- a/testing/cuda/copy.cu
+++ b/testing/cuda/copy.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
 __global__
 void copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
@@ -89,4 +90,5 @@ void TestCopyNDeviceDevice(size_t n)
   TestCopyNDevice<T>(thrust::device, n);
 }
 DECLARE_VARIABLE_UNITTEST(TestCopyNDeviceDevice);
+#endif
 
diff --git a/testing/cuda/copy.mk b/testing/cuda/copy.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/copy.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/copy_if.cu b/testing/cuda/copy_if.cu
index dcec12fde..bb879b671 100644
--- a/testing/cuda/copy_if.cu
+++ b/testing/cuda/copy_if.cu
@@ -3,7 +3,6 @@
 #include <thrust/sequence.h>
 #include <thrust/execution_policy.h>
 
-
 template<typename T>
 struct is_even
 {
@@ -20,6 +19,7 @@ struct mod_3
 };
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Predicate, typename Iterator3>
 __global__ void copy_if_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result1, Predicate pred, Iterator3 result2)
 {
@@ -95,7 +95,15 @@ void TestCopyIfDeviceDevice()
 DECLARE_UNITTEST(TestCopyIfDeviceDevice);
 
 
-void TestCopyIfCudaStreams()
+void TestCopyIfDeviceNoSync()
+{
+  TestCopyIfDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestCopyIfDeviceNoSync);
+#endif
+
+template<typename ExecutionPolicy>
+void TestCopyIfCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
 
@@ -111,7 +119,7 @@ void TestCopyIfCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
-  Vector::iterator end = thrust::copy_if(thrust::cuda::par.on(s),
+  Vector::iterator end = thrust::copy_if(policy.on(s),
                                          data.begin(), 
                                          data.end(), 
                                          result.begin(),
@@ -124,9 +132,19 @@ void TestCopyIfCudaStreams()
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestCopyIfCudaStreams);
 
+void TestCopyIfCudaStreamsSync(){
+  TestCopyIfCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestCopyIfCudaStreamsSync);
 
+void TestCopyIfCudaStreamsNoSync(){
+  TestCopyIfCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestCopyIfCudaStreamsNoSync);
+
+
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Predicate, typename Iterator4>
 __global__ void copy_if_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Iterator3 result1, Predicate pred, Iterator4 result2)
 {
@@ -144,9 +162,6 @@ void TestCopyIfStencilDevice(ExecutionPolicy exec)
   thrust::host_vector<int>   h_stencil = unittest::random_integers<int>(n);
   thrust::device_vector<int> d_stencil = unittest::random_integers<int>(n);
   
-  thrust::host_vector<int>   h_result(n);
-  thrust::device_vector<int> d_result(n);
-  
   typename thrust::host_vector<int>::iterator   h_new_end;
   typename thrust::device_vector<int>::iterator d_new_end;
 
@@ -208,7 +223,16 @@ void TestCopyIfStencilDeviceDevice()
 DECLARE_UNITTEST(TestCopyIfStencilDeviceDevice);
 
 
-void TestCopyIfStencilCudaStreams()
+void TestCopyIfStencilDeviceNoSync()
+{
+  TestCopyIfStencilDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestCopyIfStencilDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestCopyIfStencilCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
   typedef Vector::value_type T;
@@ -232,7 +256,7 @@ void TestCopyIfStencilCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
-  Vector::iterator end = thrust::copy_if(thrust::cuda::par.on(s),
+  Vector::iterator end = thrust::copy_if(policy.on(s),
                                          data.begin(), 
                                          data.end(),
                                          stencil.begin(),
@@ -246,5 +270,17 @@ void TestCopyIfStencilCudaStreams()
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestCopyIfStencilCudaStreams);
+
+void TestCopyIfStencilCudaStreamsSync()
+{
+  TestCopyIfStencilCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestCopyIfStencilCudaStreamsSync);
+
+
+void TestCopyIfStencilCudaStreamsNoSync()
+{
+  TestCopyIfStencilCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestCopyIfStencilCudaStreamsNoSync);
 
diff --git a/testing/cuda/copy_if.mk b/testing/cuda/copy_if.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/copy_if.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/count.cu b/testing/cuda/count.cu
index 32835f5c4..e2b9b5f5a 100644
--- a/testing/cuda/count.cu
+++ b/testing/cuda/count.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename T, typename Iterator2>
 __global__
 void count_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T value, Iterator2 result)
@@ -91,6 +92,7 @@ void TestCountIfDeviceDevice(const size_t n)
   TestCountIfDevice<T>(thrust::device, n);
 }
 DECLARE_VARIABLE_UNITTEST(TestCountIfDeviceDevice);
+#endif
 
 
 void TestCountCudaStreams()
diff --git a/testing/cuda/count.mk b/testing/cuda/count.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/count.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/cudart.mk b/testing/cuda/cudart.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/cudart.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/device_side_universal_vector.cu b/testing/cuda/device_side_universal_vector.cu
new file mode 100644
index 000000000..a31919cfc
--- /dev/null
+++ b/testing/cuda/device_side_universal_vector.cu
@@ -0,0 +1,84 @@
+#include <thrust/universal_vector.h>
+
+#include <unittest/unittest.h>
+
+template <class VecT>
+__host__ __device__ void universal_vector_access(VecT &in, thrust::universal_vector<bool> &out)
+{
+  const int expected_front  = 4;
+  const int expected_back   = 2;
+
+  out[0] = in.size() == 2 &&               //
+           in[0] == expected_front &&      //
+           in.front() == expected_front && //
+           *in.data() == expected_front && //
+           in[1] == expected_back &&       //
+           in.back() == expected_back;
+}
+
+#if defined(THRUST_TEST_DEVICE_SIDE)
+template <class VecT>
+__global__ void universal_vector_device_access_kernel(VecT &vec,
+                                                      thrust::universal_vector<bool> &out)
+{
+  universal_vector_access(vec, out);
+}
+
+template <class VecT>
+void test_universal_vector_access(VecT &vec, thrust::universal_vector<bool> &out)
+{
+  universal_vector_device_access_kernel<<<1, 1>>>(vec, out);
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  ASSERT_EQUAL(out[0], true);
+}
+#else
+template <class VecT>
+void test_universal_vector_access(VecT &vec, thrust::universal_vector<bool> &out)
+{
+  universal_vector_access(vec, out);
+  ASSERT_EQUAL(out[0], true);
+}
+#endif
+
+void TestUniversalVectorDeviceAccess()
+{
+  thrust::universal_vector<thrust::universal_vector<int>> in_storage(1);
+  thrust::universal_vector<int> &in = *thrust::raw_pointer_cast(in_storage.data());
+
+  in.resize(2);
+  in[0] = 4;
+  in[1] = 2;
+
+  thrust::universal_vector<thrust::universal_vector<bool>> out_storage(1);
+  thrust::universal_vector<bool> &out = *thrust::raw_pointer_cast(out_storage.data());
+  out.resize(1);
+  out[0] = false;
+
+  test_universal_vector_access(in, out);
+}
+DECLARE_UNITTEST(TestUniversalVectorDeviceAccess);
+
+void TestConstUniversalVectorDeviceAccess()
+{
+  thrust::universal_vector<thrust::universal_vector<int>> in_storage(1);
+
+  {
+    thrust::universal_vector<int> &in = *thrust::raw_pointer_cast(in_storage.data());
+
+    in.resize(2);
+    in[0] = 4;
+    in[1] = 2;
+  }
+
+  const thrust::universal_vector<int> &in = *thrust::raw_pointer_cast(in_storage.data());
+
+  thrust::universal_vector<thrust::universal_vector<bool>> out_storage(1);
+  thrust::universal_vector<bool> &out = *thrust::raw_pointer_cast(out_storage.data());
+
+  out.resize(1);
+  out[0] = false;
+
+  test_universal_vector_access(in, out);
+}
+DECLARE_UNITTEST(TestConstUniversalVectorDeviceAccess);
diff --git a/testing/cuda/equal.cu b/testing/cuda/equal.cu
index 84eb7254d..c5e794ed5 100644
--- a/testing/cuda/equal.cu
+++ b/testing/cuda/equal.cu
@@ -4,6 +4,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
 __global__
 void equal_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, Iterator3 result)
@@ -92,6 +93,7 @@ void TestEqualDeviceDevice(const size_t n)
   TestEqualDevice<T>(thrust::device, n);
 }
 DECLARE_VARIABLE_UNITTEST(TestEqualDeviceDevice);
+#endif
 
 
 void TestEqualCudaStreams()
diff --git a/testing/cuda/equal.mk b/testing/cuda/equal.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/equal.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/fill.cu b/testing/cuda/fill.cu
index 17cf58c54..ee0a51776 100644
--- a/testing/cuda/fill.cu
+++ b/testing/cuda/fill.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 #include <algorithm>
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename T>
 __global__
 void fill_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T value)
@@ -169,6 +170,7 @@ void TestFillNDeviceDevice(size_t n)
   TestFillNDevice<T>(thrust::device, n);
 }
 DECLARE_VARIABLE_UNITTEST(TestFillNDeviceDevice);
+#endif
 
 void TestFillCudaStreams()
 {
diff --git a/testing/cuda/fill.mk b/testing/cuda/fill.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/fill.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/find.cu b/testing/cuda/find.cu
index 4fe6f4dca..fbd86f5a0 100644
--- a/testing/cuda/find.cu
+++ b/testing/cuda/find.cu
@@ -39,6 +39,7 @@ struct less_than_value_pred
 };
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename T, typename Iterator2>
 __global__ void find_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T value, Iterator2 result)
 {
@@ -219,6 +220,7 @@ void TestFindIfNotDeviceDevice()
   TestFindIfNotDevice(thrust::device);
 };
 DECLARE_UNITTEST(TestFindIfNotDeviceDevice);
+#endif
 
 
 void TestFindCudaStreams()
diff --git a/testing/cuda/find.mk b/testing/cuda/find.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/find.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/for_each.cu b/testing/cuda/for_each.cu
index be6a7738c..afd54c621 100644
--- a/testing/cuda/for_each.cu
+++ b/testing/cuda/for_each.cu
@@ -59,6 +59,7 @@ struct mark_present_for_each
 };
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename Function>
 __global__ void for_each_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f)
 {
@@ -202,6 +203,7 @@ void TestForEachNDeviceDevice(const size_t n)
   ASSERT_EQUAL(h_output, d_output);
 }
 DECLARE_VARIABLE_UNITTEST(TestForEachNDeviceDevice);
+#endif
 
 
 void TestForEachCudaStreams()
diff --git a/testing/cuda/for_each.mk b/testing/cuda/for_each.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/for_each.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/gather.cu b/testing/cuda/gather.cu
index a9a8c9333..6af4d4727 100644
--- a/testing/cuda/gather.cu
+++ b/testing/cuda/gather.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 #include <algorithm>
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
 __global__
 void gather_kernel(ExecutionPolicy exec, Iterator1 map_first, Iterator1 map_last, Iterator2 elements_first, Iterator3 result)
@@ -56,6 +57,7 @@ void TestGatherDeviceDevice(const size_t n)
   TestGatherDevice<T>(thrust::device, n);
 }
 DECLARE_VARIABLE_UNITTEST(TestGatherDeviceDevice);
+#endif
 
 
 void TestGatherCudaStreams()
@@ -85,6 +87,7 @@ void TestGatherCudaStreams()
 DECLARE_UNITTEST(TestGatherCudaStreams);
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Predicate>
 __global__
 void gather_if_kernel(ExecutionPolicy exec, Iterator1 map_first, Iterator1 map_last, Iterator2 stencil_first, Iterator3 elements_first, Iterator4 result, Predicate pred)
@@ -157,6 +160,7 @@ void TestGatherIfDeviceDevice(const size_t n)
   TestGatherIfDevice<T>(thrust::device, n);
 }
 DECLARE_VARIABLE_UNITTEST(TestGatherIfDeviceDevice);
+#endif
 
 void TestGatherIfCudaStreams(void)
 {
diff --git a/testing/cuda/gather.mk b/testing/cuda/gather.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/gather.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/generate.cu b/testing/cuda/generate.cu
index c495e5563..407da920c 100644
--- a/testing/cuda/generate.cu
+++ b/testing/cuda/generate.cu
@@ -3,14 +3,6 @@
 #include <thrust/execution_policy.h>
 
 
-template<typename ExecutionPolicy, typename Iterator, typename Function>
-__global__
-void generate_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f)
-{
-  thrust::generate(exec, first, last, f);
-}
-
-
 template<typename T>
 struct return_value
 {
@@ -24,6 +16,15 @@ struct return_value
 };
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
+template<typename ExecutionPolicy, typename Iterator, typename Function>
+__global__
+void generate_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f)
+{
+  thrust::generate(exec, first, last, f);
+}
+
+
 template<typename T, typename ExecutionPolicy>
 void TestGenerateDevice(ExecutionPolicy exec, const size_t n)
 {
@@ -59,6 +60,7 @@ void TestGenerateDeviceDevice(const size_t n)
   TestGenerateDevice<T>(thrust::device, n);
 }
 DECLARE_VARIABLE_UNITTEST(TestGenerateDeviceDevice);
+#endif
 
 
 void TestGenerateCudaStreams()
@@ -86,6 +88,7 @@ void TestGenerateCudaStreams()
 DECLARE_UNITTEST(TestGenerateCudaStreams);
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename Size, typename Function>
 __global__
 void generate_n_kernel(ExecutionPolicy exec, Iterator first, Size n, Function f)
@@ -129,6 +132,7 @@ void TestGenerateNDeviceDevice(const size_t n)
   TestGenerateNDevice<T>(thrust::device, n);
 }
 DECLARE_VARIABLE_UNITTEST(TestGenerateNDeviceDevice);
+#endif
 
 
 void TestGenerateNCudaStreams()
diff --git a/testing/cuda/generate.mk b/testing/cuda/generate.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/generate.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/inner_product.cu b/testing/cuda/inner_product.cu
index 3dbb1150c..0c2276942 100644
--- a/testing/cuda/inner_product.cu
+++ b/testing/cuda/inner_product.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename T, typename Iterator3>
 __global__
 void inner_product_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, T init, Iterator3 result)
@@ -50,6 +51,7 @@ void TestInnerProductDeviceDevice()
   TestInnerProductDevice(thrust::device);
 };
 DECLARE_UNITTEST(TestInnerProductDeviceDevice);
+#endif
 
 
 void TestInnerProductCudaStreams()
diff --git a/testing/cuda/inner_product.mk b/testing/cuda/inner_product.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/inner_product.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/is_partitioned.cu b/testing/cuda/is_partitioned.cu
index 70379793b..468e17746 100644
--- a/testing/cuda/is_partitioned.cu
+++ b/testing/cuda/is_partitioned.cu
@@ -4,6 +4,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename Predicate, typename Iterator2>
 __global__
 void is_partitioned_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Predicate pred, Iterator2 result)
@@ -66,6 +67,7 @@ void TestIsPartitionedDeviceDevice()
   TestIsPartitionedDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestIsPartitionedDeviceDevice);
+#endif
 
 
 void TestIsPartitionedCudaStreams()
diff --git a/testing/cuda/is_partitioned.mk b/testing/cuda/is_partitioned.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/is_partitioned.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/is_sorted.cu b/testing/cuda/is_sorted.cu
index c6e11f6fc..1e9ef16ae 100644
--- a/testing/cuda/is_sorted.cu
+++ b/testing/cuda/is_sorted.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename Iterator2>
 __global__
 void is_sorted_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Iterator2 result)
@@ -55,6 +56,7 @@ void TestIsSortedDeviceDevice()
   TestIsSortedDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestIsSortedDeviceDevice);
+#endif
 
 
 void TestIsSortedCudaStreams()
diff --git a/testing/cuda/is_sorted.mk b/testing/cuda/is_sorted.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/is_sorted.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/is_sorted_until.cu b/testing/cuda/is_sorted_until.cu
index d84f09fca..9e6d5ac76 100644
--- a/testing/cuda/is_sorted_until.cu
+++ b/testing/cuda/is_sorted_until.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
 __global__
 void is_sorted_until_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
@@ -57,6 +58,7 @@ void TestIsSortedUntilDeviceDevice()
   TestIsSortedUntilDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestIsSortedUntilDeviceDevice);
+#endif
 
 
 void TestIsSortedUntilCudaStreams()
diff --git a/testing/cuda/is_sorted_until.mk b/testing/cuda/is_sorted_until.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/is_sorted_until.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/logical.cu b/testing/cuda/logical.cu
index 61e7dc49a..a08f041b7 100644
--- a/testing/cuda/logical.cu
+++ b/testing/cuda/logical.cu
@@ -4,6 +4,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename Function, typename Iterator2>
 __global__
 void all_of_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f, Iterator2 result)
@@ -83,6 +84,7 @@ void TestAllOfDeviceDevice()
   TestAllOfDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestAllOfDeviceDevice);
+#endif
 
 
 void TestAllOfCudaStreams()
@@ -111,6 +113,7 @@ void TestAllOfCudaStreams()
 DECLARE_UNITTEST(TestAllOfCudaStreams);
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename Function, typename Iterator2>
 __global__
 void any_of_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f, Iterator2 result)
@@ -191,6 +194,7 @@ void TestAnyOfDeviceDevice()
   TestAnyOfDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestAnyOfDeviceDevice);
+#endif
 
 
 void TestAnyOfCudaStreams()
@@ -219,6 +223,7 @@ void TestAnyOfCudaStreams()
 DECLARE_UNITTEST(TestAnyOfCudaStreams);
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename Function, typename Iterator2>
 __global__
 void none_of_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f, Iterator2 result)
@@ -299,6 +304,7 @@ void TestNoneOfDeviceDevice()
   TestNoneOfDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestNoneOfDeviceDevice);
+#endif
 
 
 void TestNoneOfCudaStreams()
diff --git a/testing/cuda/logical.mk b/testing/cuda/logical.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/logical.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/managed_memory_pointer.mk b/testing/cuda/managed_memory_pointer.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/managed_memory_pointer.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/max_element.cu b/testing/cuda/max_element.cu
index a18d9656a..defc314d1 100644
--- a/testing/cuda/max_element.cu
+++ b/testing/cuda/max_element.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename Iterator2>
 __global__
 void max_element_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Iterator2 result)
@@ -67,7 +68,16 @@ void TestMaxElementDeviceDevice()
 DECLARE_UNITTEST(TestMaxElementDeviceDevice);
 
 
-void TestMaxElementCudaStreams()
+void TestMaxElementDeviceNoSync()
+{
+  TestMaxElementDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestMaxElementDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestMaxElementCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
   typedef Vector::value_type T;
@@ -83,15 +93,28 @@ void TestMaxElementCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
-  ASSERT_EQUAL( *thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end()), 5);
-  ASSERT_EQUAL( thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end()) - data.begin(), 1);
+  auto streampolicy = policy.on(s);
+
+  ASSERT_EQUAL( *thrust::max_element(streampolicy, data.begin(), data.end()), 5);
+  ASSERT_EQUAL( thrust::max_element(streampolicy, data.begin(), data.end()) - data.begin(), 1);
   
-  ASSERT_EQUAL( *thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end(), thrust::greater<T>()), 1);
-  ASSERT_EQUAL( thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end(), thrust::greater<T>()) - data.begin(), 2);
+  ASSERT_EQUAL( *thrust::max_element(streampolicy, data.begin(), data.end(), thrust::greater<T>()), 1);
+  ASSERT_EQUAL( thrust::max_element(streampolicy, data.begin(), data.end(), thrust::greater<T>()) - data.begin(), 2);
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestMaxElementCudaStreams);
+
+void TestMaxElementCudaStreamsSync(){
+  TestMaxElementCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestMaxElementCudaStreamsSync);
+
+
+void TestMaxElementCudaStreamsNoSync(){
+  TestMaxElementCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestMaxElementCudaStreamsNoSync);
+
 
 void TestMaxElementDevicePointer()
 {
diff --git a/testing/cuda/max_element.mk b/testing/cuda/max_element.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/max_element.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/memory.cu b/testing/cuda/memory.cu
index ed9acec55..eda432ca8 100644
--- a/testing/cuda/memory.cu
+++ b/testing/cuda/memory.cu
@@ -35,6 +35,7 @@ void TestSelectSystemCudaToCpp()
 DECLARE_UNITTEST(TestSelectSystemCudaToCpp);
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename Iterator>
 __global__ void get_temporary_buffer_kernel(size_t n, Iterator result)
 {
@@ -43,9 +44,9 @@ __global__ void get_temporary_buffer_kernel(size_t n, Iterator result)
 
 
 template<typename Pointer>
-__global__ void return_temporary_buffer_kernel(Pointer ptr)
+__global__ void return_temporary_buffer_kernel(Pointer ptr, std::ptrdiff_t n)
 {
-  thrust::return_temporary_buffer(thrust::seq, ptr);
+  thrust::return_temporary_buffer(thrust::seq, ptr, n);
 }
 
 
@@ -58,8 +59,10 @@ void TestGetTemporaryBufferDeviceSeq()
   thrust::device_vector<ptr_and_sz_type> d_result(1);
   
   get_temporary_buffer_kernel<<<1,1>>>(n, d_result.begin());
-  cudaError_t const err = cudaDeviceSynchronize();
-  ASSERT_EQUAL(cudaSuccess, err);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
 
   ptr_and_sz_type ptr_and_sz = d_result[0];
 
@@ -74,9 +77,11 @@ void TestGetTemporaryBufferDeviceSeq()
 
     ASSERT_EQUAL(true, thrust::all_of(thrust::device, ptr_and_sz.first, ptr_and_sz.first + n, thrust::placeholders::_1 == ref_val));
 
-    return_temporary_buffer_kernel<<<1,1>>>(ptr_and_sz.first);
-    cudaError_t const err = cudaDeviceSynchronize();
-    ASSERT_EQUAL(cudaSuccess, err);
+    return_temporary_buffer_kernel<<<1,1>>>(ptr_and_sz.first, ptr_and_sz.second);
+    {
+      cudaError_t const err = cudaDeviceSynchronize();
+      ASSERT_EQUAL(cudaSuccess, err);
+    }
   }
 }
 DECLARE_UNITTEST(TestGetTemporaryBufferDeviceSeq);
@@ -104,8 +109,10 @@ void TestMallocDeviceSeq()
   thrust::device_vector<pointer> d_result(1);
   
   malloc_kernel<<<1,1>>>(n, d_result.begin());
-  cudaError_t const err = cudaDeviceSynchronize();
-  ASSERT_EQUAL(cudaSuccess, err);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
 
   pointer ptr = d_result[0];
 
@@ -119,9 +126,12 @@ void TestMallocDeviceSeq()
     ASSERT_EQUAL(true, thrust::all_of(thrust::device, ptr, ptr + n, thrust::placeholders::_1 == ref_val));
 
     free_kernel<<<1,1>>>(ptr);
-    cudaError_t const err = cudaDeviceSynchronize();
-    ASSERT_EQUAL(cudaSuccess, err);
+    {
+      cudaError_t const err = cudaDeviceSynchronize();
+      ASSERT_EQUAL(cudaSuccess, err);
+    }
   }
 }
 DECLARE_UNITTEST(TestMallocDeviceSeq);
+#endif
 
diff --git a/testing/cuda/memory.mk b/testing/cuda/memory.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/memory.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/merge.cu b/testing/cuda/merge.cu
index 5e13b9d3a..1a96e8774 100644
--- a/testing/cuda/merge.cu
+++ b/testing/cuda/merge.cu
@@ -6,6 +6,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4>
 __global__
 void merge_kernel(ExecutionPolicy exec,
@@ -80,6 +81,7 @@ void TestMergeDeviceDevice()
   TestMergeDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestMergeDeviceDevice);
+#endif
 
 
 void TestMergeCudaStreams()
diff --git a/testing/cuda/merge.mk b/testing/cuda/merge.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/merge.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/merge_by_key.cu b/testing/cuda/merge_by_key.cu
index 84b80e007..40ea542df 100644
--- a/testing/cuda/merge_by_key.cu
+++ b/testing/cuda/merge_by_key.cu
@@ -5,6 +5,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy,
          typename Iterator1,
          typename Iterator2,
@@ -84,6 +85,7 @@ void TestMergeByKeyDeviceDevice()
   TestMergeByKeyDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestMergeByKeyDeviceDevice);
+#endif
 
 
 void TestMergeByKeyCudaStreams()
diff --git a/testing/cuda/merge_by_key.mk b/testing/cuda/merge_by_key.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/merge_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/merge_sort.mk b/testing/cuda/merge_sort.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/merge_sort.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/min_element.cu b/testing/cuda/min_element.cu
index 49d13c2a5..38dd96b11 100644
--- a/testing/cuda/min_element.cu
+++ b/testing/cuda/min_element.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename Iterator2>
 __global__
 void min_element_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Iterator2 result)
@@ -64,6 +65,7 @@ void TestMinElementDeviceDevice()
   TestMinElementDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestMinElementDeviceDevice);
+#endif
 
 
 void TestMinElementCudaStreams()
diff --git a/testing/cuda/min_element.mk b/testing/cuda/min_element.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/min_element.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/minmax_element.cu b/testing/cuda/minmax_element.cu
index e3cae07a2..6376bc28b 100644
--- a/testing/cuda/minmax_element.cu
+++ b/testing/cuda/minmax_element.cu
@@ -2,6 +2,7 @@
 #include <thrust/extrema.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
 __global__
 void minmax_element_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
@@ -85,6 +86,7 @@ void TestMinMaxElementDeviceDevice()
   TestMinMaxElementDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestMinMaxElementDeviceDevice);
+#endif
 
 
 void TestMinMaxElementCudaStreams()
diff --git a/testing/cuda/minmax_element.mk b/testing/cuda/minmax_element.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/minmax_element.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/mismatch.cu b/testing/cuda/mismatch.cu
index 5b08f4307..aac89352a 100644
--- a/testing/cuda/mismatch.cu
+++ b/testing/cuda/mismatch.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
 __global__ void mismatch_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, Iterator3 result)
 {
@@ -72,6 +73,7 @@ void TestMismatchDeviceDevice()
   TestMismatchDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestMismatchDeviceDevice);
+#endif
 
 
 void TestMismatchCudaStreams()
diff --git a/testing/cuda/mismatch.mk b/testing/cuda/mismatch.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/mismatch.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/pair_sort.cu b/testing/cuda/pair_sort.cu
index 87838e429..da23e4cb2 100644
--- a/testing/cuda/pair_sort.cu
+++ b/testing/cuda/pair_sort.cu
@@ -4,16 +4,12 @@
 #include <thrust/execution_policy.h>
 
 
-template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
+#ifdef THRUST_TEST_DEVICE_SIDE
+template<typename ExecutionPolicy, typename Iterator>
 __global__
-void stable_sort_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 is_supported)
+void stable_sort_kernel(ExecutionPolicy exec, Iterator first, Iterator last)
 {
-#if (__CUDA_ARCH__ >= 200)
-  *is_supported = true;
   thrust::stable_sort(exec, first, last);
-#else
-  *is_supported = false;
-#endif
 }
 
 
@@ -43,19 +39,14 @@ void TestPairStableSortDevice(ExecutionPolicy exec)
 
   thrust::device_vector<P> d_pairs = h_pairs;
 
-  thrust::device_vector<bool> is_supported(1);
-
-  stable_sort_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end(), is_supported.begin());
+  stable_sort_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end());
   cudaError_t const err = cudaDeviceSynchronize();
   ASSERT_EQUAL(cudaSuccess, err);
 
-  if(is_supported[0])
-  {
-    // sort on the host
-    thrust::stable_sort(h_pairs.begin(), h_pairs.end());
+  // sort on the host
+  thrust::stable_sort(h_pairs.begin(), h_pairs.end());
 
-    ASSERT_EQUAL_QUIET(h_pairs, d_pairs);
-  }
+  ASSERT_EQUAL_QUIET(h_pairs, d_pairs);
 };
 
 
@@ -71,4 +62,5 @@ void TestPairStableSortDeviceDevice()
   TestPairStableSortDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestPairStableSortDeviceDevice);
+#endif
 
diff --git a/testing/cuda/pair_sort.mk b/testing/cuda/pair_sort.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/pair_sort.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/pair_sort_by_key.cu b/testing/cuda/pair_sort_by_key.cu
index 19996e5a2..fa229b8a6 100644
--- a/testing/cuda/pair_sort_by_key.cu
+++ b/testing/cuda/pair_sort_by_key.cu
@@ -6,16 +6,12 @@
 #include <thrust/execution_policy.h>
 
 
-template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
+#ifdef THRUST_TEST_DEVICE_SIDE
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
 __global__
-void stable_sort_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 is_supported)
+void stable_sort_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first)
 {
-#if (__CUDA_ARCH__ >= 200)
-  *is_supported = true;
   thrust::stable_sort_by_key(exec, keys_first, keys_last, values_first);
-#else
-  *is_supported = false;
-#endif
 }
 
 
@@ -51,21 +47,16 @@ void TestPairStableSortByKeyDevice(ExecutionPolicy exec)
   thrust::device_vector<P>   d_pairs = h_pairs;
   thrust::device_vector<int> d_values = h_values;
 
-  thrust::device_vector<bool> is_supported(1);
-
   // sort on the device
-  stable_sort_by_key_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end(), d_values.begin(), is_supported.begin());
+  stable_sort_by_key_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end(), d_values.begin());
   cudaError_t const err = cudaDeviceSynchronize();
   ASSERT_EQUAL(cudaSuccess, err);
 
-  if(is_supported[0])
-  {
-    // sort on the host
-    thrust::stable_sort_by_key(h_pairs.begin(), h_pairs.end(), h_values.begin());
+  // sort on the host
+  thrust::stable_sort_by_key(h_pairs.begin(), h_pairs.end(), h_values.begin());
 
-    ASSERT_EQUAL_QUIET(h_pairs,  d_pairs);
-    ASSERT_EQUAL(h_values, d_values);
-  }
+  ASSERT_EQUAL_QUIET(h_pairs,  d_pairs);
+  ASSERT_EQUAL(h_values, d_values);
 };
 
 
@@ -81,4 +72,5 @@ void TestPairStableSortByKeyDeviceDevice()
   TestPairStableSortByKeyDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestPairStableSortByKeyDeviceDevice);
+#endif
 
diff --git a/testing/cuda/pair_sort_by_key.mk b/testing/cuda/pair_sort_by_key.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/pair_sort_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/partition.cu b/testing/cuda/partition.cu
index a70ac0732..f8701db6f 100644
--- a/testing/cuda/partition.cu
+++ b/testing/cuda/partition.cu
@@ -4,14 +4,6 @@
 #include <thrust/execution_policy.h>
 
 
-template<typename ExecutionPolicy, typename Iterator1, typename Predicate, typename Iterator2>
-__global__
-void partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result)
-{
-  *result = thrust::partition(exec, first, last, pred);
-}
-
-
 template<typename T>
 struct is_even
 {
@@ -20,6 +12,15 @@ struct is_even
 };
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
+template<typename ExecutionPolicy, typename Iterator1, typename Predicate, typename Iterator2>
+__global__
+void partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result)
+{
+  *result = thrust::partition(exec, first, last, pred);
+}
+
+
 template<typename ExecutionPolicy>
 void TestPartitionDevice(ExecutionPolicy exec)
 {
@@ -65,6 +66,13 @@ void TestPartitionDeviceDevice()
 DECLARE_UNITTEST(TestPartitionDeviceDevice);
 
 
+void TestPartitionDeviceNoSync()
+{
+  TestPartitionDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestPartitionDeviceNoSync);
+
+
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Predicate, typename Iterator3>
 __global__
 void partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, Iterator3 result)
@@ -125,6 +133,13 @@ void TestPartitionStencilDeviceDevice()
 DECLARE_UNITTEST(TestPartitionStencilDeviceDevice);
 
 
+void TestPartitionStencilDeviceNoSync()
+{
+  TestPartitionStencilDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestPartitionStencilDeviceNoSync);
+
+
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Predicate, typename Iterator4>
 __global__
 void partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 true_result, Iterator3 false_result, Predicate pred, Iterator4 result)
@@ -188,6 +203,13 @@ void TestPartitionCopyDeviceDevice()
 DECLARE_UNITTEST(TestPartitionCopyDeviceDevice);
 
 
+void TestPartitionCopyDeviceNoSync()
+{
+  TestPartitionCopyDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestPartitionCopyDeviceNoSync);
+
+
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Predicate, typename Iterator5>
 __global__
 void partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Iterator3 true_result, Iterator4 false_result, Predicate pred, Iterator5 result)
@@ -258,16 +280,18 @@ void TestPartitionCopyStencilDeviceDevice()
 DECLARE_UNITTEST(TestPartitionCopyStencilDeviceDevice);
 
 
-template<typename ExecutionPolicy, typename Iterator1, typename Predicate, typename Iterator2, typename Iterator3>
+void TestPartitionCopyStencilDeviceNoSync()
+{
+  TestPartitionCopyStencilDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestPartitionCopyStencilDeviceNoSync);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Predicate, typename Iterator2>
 __global__
-void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result, Iterator3 is_supported)
+void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result)
 {
-#if (__CUDA_ARCH__ >= 200)
-  *is_supported = true;
   *result = thrust::stable_partition(exec, first, last, pred);
-#else
-  *is_supported = false;
-#endif
 }
 
 
@@ -285,24 +309,20 @@ void TestStablePartitionDevice(ExecutionPolicy exec)
   data[4] = 2; 
 
   thrust::device_vector<iterator> result(1);
-  thrust::device_vector<bool> is_supported(1);
-  
-  stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), is_even<T>(), result.begin(), is_supported.begin());
+
+  stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), is_even<T>(), result.begin());
   cudaError_t const err = cudaDeviceSynchronize();
   ASSERT_EQUAL(cudaSuccess, err);
   
-  if(is_supported[0])
-  {
-    thrust::device_vector<T> ref(5);
-    ref[0] = 2;
-    ref[1] = 2;
-    ref[2] = 1;
-    ref[3] = 1;
-    ref[4] = 1;
+  thrust::device_vector<T> ref(5);
+  ref[0] = 2;
+  ref[1] = 2;
+  ref[2] = 1;
+  ref[3] = 1;
+  ref[4] = 1;
     
-    ASSERT_EQUAL(2, (iterator)result[0] - data.begin());
-    ASSERT_EQUAL(ref, data);
-  }
+  ASSERT_EQUAL(2, (iterator)result[0] - data.begin());
+  ASSERT_EQUAL(ref, data);
 }
 
 
@@ -320,16 +340,18 @@ void TestStablePartitionDeviceDevice()
 DECLARE_UNITTEST(TestStablePartitionDeviceDevice);
 
 
-template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Predicate, typename Iterator3, typename Iterator4>
+void TestStablePartitionDeviceNoSync()
+{
+  TestStablePartitionDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestStablePartitionDeviceNoSync);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Predicate, typename Iterator3>
 __global__
-void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, Iterator3 result, Iterator4 is_supported)
+void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, Iterator3 result)
 {
-#if (__CUDA_ARCH__ >= 200)
-  *is_supported = true;
   *result = thrust::stable_partition(exec, first, last, stencil_first, pred);
-#else
-  *is_supported = false;
-#endif
 }
 
 
@@ -354,24 +376,20 @@ void TestStablePartitionStencilDevice(ExecutionPolicy exec)
   stencil[4] = 2; 
 
   thrust::device_vector<iterator> result(1);
-  thrust::device_vector<bool> is_supported(1);
-  
-  stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), stencil.begin(), is_even<T>(), result.begin(), is_supported.begin());
+
+  stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), stencil.begin(), is_even<T>(), result.begin());
   cudaError_t const err = cudaDeviceSynchronize();
   ASSERT_EQUAL(cudaSuccess, err);
   
-  if(is_supported[0])
-  {
-    thrust::device_vector<T> ref(5);
-    ref[0] = 1;
-    ref[1] = 1;
-    ref[2] = 0;
-    ref[3] = 0;
-    ref[4] = 0;
+  thrust::device_vector<T> ref(5);
+  ref[0] = 1;
+  ref[1] = 1;
+  ref[2] = 0;
+  ref[3] = 0;
+  ref[4] = 0;
     
-    ASSERT_EQUAL(2, (iterator)result[0] - data.begin());
-    ASSERT_EQUAL(ref, data);
-  }
+  ASSERT_EQUAL(2, (iterator)result[0] - data.begin());
+  ASSERT_EQUAL(ref, data);
 }
 
 
@@ -389,6 +407,13 @@ void TestStablePartitionStencilDeviceDevice()
 DECLARE_UNITTEST(TestStablePartitionStencilDeviceDevice);
 
 
+void TestStablePartitionStencilDeviceNoSync()
+{
+  TestStablePartitionStencilDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestStablePartitionStencilDeviceNoSync);
+
+
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Predicate, typename Iterator4>
 __global__
 void stable_partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 true_result, Iterator3 false_result, Predicate pred, Iterator4 result)
@@ -452,6 +477,13 @@ void TestStablePartitionCopyDeviceDevice()
 DECLARE_UNITTEST(TestStablePartitionCopyDeviceDevice);
 
 
+void TestStablePartitionCopyDeviceNoSync()
+{
+  TestStablePartitionCopyDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestStablePartitionCopyDeviceNoSync);
+
+
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Predicate, typename Iterator5>
 __global__
 void stable_partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Iterator3 true_result, Iterator4 false_result, Predicate pred, Iterator5 result)
@@ -522,7 +554,16 @@ void TestStablePartitionCopyStencilDeviceDevice()
 DECLARE_UNITTEST(TestStablePartitionCopyStencilDeviceDevice);
 
 
-void TestPartitionCudaStreams()
+void TestStablePartitionCopyStencilDeviceNoSync()
+{
+  TestStablePartitionCopyStencilDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestStablePartitionCopyStencilDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestPartitionCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
   typedef Vector::value_type T;
@@ -537,8 +578,10 @@ void TestPartitionCudaStreams()
 
   cudaStream_t s;
   cudaStreamCreate(&s);
+
+  auto streampolicy = policy.on(s);
   
-  Iterator iter = thrust::partition(thrust::cuda::par.on(s), data.begin(), data.end(), is_even<T>());
+  Iterator iter = thrust::partition(streampolicy, data.begin(), data.end(), is_even<T>());
   
   Vector ref(5);
   ref[0] = 2;
@@ -552,5 +595,17 @@ void TestPartitionCudaStreams()
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestPartitionCudaStreams);
+
+void TestPartitionCudaStreamsSync()
+{
+  TestPartitionCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestPartitionCudaStreamsSync);
+
+
+void TestPartitionCudaStreamsNoSync()
+{
+  TestPartitionCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestPartitionCudaStreamsNoSync);
 
diff --git a/testing/cuda/partition.mk b/testing/cuda/partition.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/partition.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/partition_point.cu b/testing/cuda/partition_point.cu
index 0b95fcb02..57e4344ee 100644
--- a/testing/cuda/partition_point.cu
+++ b/testing/cuda/partition_point.cu
@@ -4,6 +4,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Predicate, typename Iterator2>
 __global__
 void partition_point_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result)
@@ -50,6 +51,7 @@ void TestPartitionPointDeviceDevice()
   TestPartitionPointDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestPartitionPointDeviceDevice);
+#endif
 
 
 void TestPartitionPointCudaStreams()
diff --git a/testing/cuda/partition_point.mk b/testing/cuda/partition_point.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/partition_point.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/pinned_allocator.cu b/testing/cuda/pinned_allocator.cu
deleted file mode 100644
index 23ccc7d40..000000000
--- a/testing/cuda/pinned_allocator.cu
+++ /dev/null
@@ -1,19 +0,0 @@
-#include <unittest/unittest.h>
-#include <thrust/system/cuda/experimental/pinned_allocator.h>
-#include <thrust/host_vector.h>
-#include <thrust/copy.h>
-
-template <typename T>
-void TestPinnedAllocatorSimple(const size_t n)
-{
-  typedef thrust::host_vector<T, thrust::cuda::experimental::pinned_allocator<T> > Vector;
-
-  Vector h_input = unittest::random_integers<T>(n);
-  Vector h_output(n);
-
-  thrust::copy(h_input.begin(), h_input.end(), h_output.begin());
-
-  ASSERT_EQUAL(h_input, h_output);
-}
-DECLARE_VARIABLE_UNITTEST(TestPinnedAllocatorSimple);
-
diff --git a/testing/cuda/reduce.cu b/testing/cuda/reduce.cu
index 9cefcc0ed..865d31c22 100644
--- a/testing/cuda/reduce.cu
+++ b/testing/cuda/reduce.cu
@@ -1,6 +1,7 @@
 #include <unittest/unittest.h>
 #include <thrust/reduce.h>
 #include <thrust/execution_policy.h>
+#include <thrust/iterator/constant_iterator.h>
 
 
 template<typename ExecutionPolicy, typename Iterator, typename T, typename Iterator2>
@@ -11,6 +12,7 @@ void reduce_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T init,
 }
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename T, typename ExecutionPolicy>
 void TestReduceDevice(ExecutionPolicy exec, const size_t n)
 {
@@ -53,7 +55,20 @@ struct TestReduceDeviceDevice
 VariableUnitTest<TestReduceDeviceDevice, IntegralTypes> TestReduceDeviceDeviceInstance;
 
 
-void TestReduceCudaStreams()
+template<typename T>
+struct TestReduceDeviceNoSync
+{
+  void operator()(const size_t n)
+  {
+    TestReduceDevice<T>(thrust::cuda::par_nosync, n);
+  }
+};
+VariableUnitTest<TestReduceDeviceNoSync, IntegralTypes> TestReduceDeviceNoSyncInstance;
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestReduceCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
 
@@ -63,13 +78,46 @@ void TestReduceCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
+  auto streampolicy = policy.on(s);
+
   // no initializer
-  ASSERT_EQUAL(thrust::reduce(thrust::cuda::par.on(s), v.begin(), v.end()), 2);
+  ASSERT_EQUAL(thrust::reduce(streampolicy, v.begin(), v.end()), 2);
 
   // with initializer
-  ASSERT_EQUAL(thrust::reduce(thrust::cuda::par.on(s), v.begin(), v.end(), 10), 12);
+  ASSERT_EQUAL(thrust::reduce(streampolicy, v.begin(), v.end(), 10), 12);
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestReduceCudaStreams);
+
+void TestReduceCudaStreamsSync()
+{
+  TestReduceCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestReduceCudaStreamsSync);
+
+
+void TestReduceCudaStreamsNoSync()
+{
+  TestReduceCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestReduceCudaStreamsNoSync);
+
+#if defined(THRUST_RDC_ENABLED)
+void TestReduceLargeInput()
+{
+  using T = unsigned long long;
+  using OffsetT = std::size_t;
+  const OffsetT num_items = 1ull << 32;
+
+  thrust::constant_iterator<T> d_data(T{1});
+  thrust::device_vector<T> d_result(1);
+
+  reduce_kernel<<<1,1>>>(thrust::device, d_data, d_data + num_items, T{}, d_result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  ASSERT_EQUAL(num_items, d_result[0]);
+}
+DECLARE_UNITTEST(TestReduceLargeInput);
+#endif
 
diff --git a/testing/cuda/reduce.mk b/testing/cuda/reduce.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/reduce.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/reduce_by_key.cu b/testing/cuda/reduce_by_key.cu
index 993a39bd4..20f44fb42 100644
--- a/testing/cuda/reduce_by_key.cu
+++ b/testing/cuda/reduce_by_key.cu
@@ -1,8 +1,14 @@
-#include <unittest/unittest.h>
-#include <thrust/reduce.h>
+#include <thrust/equal.h>
 #include <thrust/execution_policy.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/reduce.h>
+#include <unittest/unittest.h>
 
+#include <cstdint>
 
+
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Iterator5>
 __global__
 void reduce_by_key_kernel(ExecutionPolicy exec,
@@ -43,6 +49,7 @@ void reduce_by_key_kernel(ExecutionPolicy exec,
 {
   *result = thrust::reduce_by_key(exec, keys_first, keys_last, values_first, keys_result, values_result, pred, binary_op);
 }
+#endif
 
 
 template<typename T>
@@ -85,6 +92,7 @@ void initialize_values(Vector& values)
 }
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy>
 void TestReduceByKeyDevice(ExecutionPolicy exec)
 {
@@ -191,7 +199,16 @@ void TestReduceByKeyDeviceDevice()
 DECLARE_UNITTEST(TestReduceByKeyDeviceDevice);
 
 
-void TestReduceByKeyCudaStreams()
+void TestReduceByKeyDeviceNoSync()
+{
+  TestReduceByKeyDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestReduceByKeyDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestReduceByKeyCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
   typedef Vector::value_type T;
@@ -210,7 +227,9 @@ void TestReduceByKeyCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
-  new_last = thrust::reduce_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin());
+  auto streampolicy = policy.on(s);
+
+  new_last = thrust::reduce_by_key(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin());
 
   ASSERT_EQUAL(new_last.first  - output_keys.begin(),   5);
   ASSERT_EQUAL(new_last.second - output_values.begin(), 5);
@@ -229,7 +248,7 @@ void TestReduceByKeyCudaStreams()
   // test BinaryPredicate
   initialize_keys(keys);  initialize_values(values);
   
-  new_last = thrust::reduce_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_reduce<T>());
+  new_last = thrust::reduce_by_key(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_reduce<T>());
 
   ASSERT_EQUAL(new_last.first  - output_keys.begin(),   3);
   ASSERT_EQUAL(new_last.second - output_values.begin(), 3);
@@ -244,7 +263,7 @@ void TestReduceByKeyCudaStreams()
   // test BinaryFunction
   initialize_keys(keys);  initialize_values(values);
 
-  new_last = thrust::reduce_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), thrust::equal_to<T>(), thrust::plus<T>());
+  new_last = thrust::reduce_by_key(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), thrust::equal_to<T>(), thrust::plus<T>());
 
   ASSERT_EQUAL(new_last.first  - output_keys.begin(),   5);
   ASSERT_EQUAL(new_last.second - output_values.begin(), 5);
@@ -262,5 +281,120 @@ void TestReduceByKeyCudaStreams()
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestReduceByKeyCudaStreams);
 
+void TestReduceByKeyCudaStreamsSync()
+{
+  TestReduceByKeyCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestReduceByKeyCudaStreamsSync);
+
+
+void TestReduceByKeyCudaStreamsNoSync()
+{
+  TestReduceByKeyCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestReduceByKeyCudaStreamsNoSync);
+
+
+// Maps indices to key ids
+class div_op : public thrust::unary_function<std::int64_t, std::int64_t>
+{
+  std::int64_t m_divisor;
+
+public:
+  __host__ div_op(std::int64_t divisor)
+    : m_divisor(divisor)
+  {}
+
+  __host__ __device__
+  std::int64_t operator()(std::int64_t x) const
+  {
+    return x / m_divisor;
+  }
+};
+
+// Produces unique sequence for key
+class mod_op : public thrust::unary_function<std::int64_t, std::int64_t>
+{
+  std::int64_t m_divisor;
+
+public:
+  __host__ mod_op(std::int64_t divisor)
+    : m_divisor(divisor)
+  {}
+
+  __host__ __device__
+  std::int64_t operator()(std::int64_t x) const
+  {
+    // div: 2          
+    // idx: 0 1   2 3   4 5 
+    // key: 0 0 | 1 1 | 2 2 
+    // mod: 0 1 | 0 1 | 0 1
+    // ret: 0 1   1 2   2 3
+    return (x % m_divisor) + (x / m_divisor);
+  }
+};
+
+
+void TestReduceByKeyWithBigIndexesHelper(int magnitude)
+{
+  const std::int64_t key_size_magnitude = 8;
+  ASSERT_EQUAL(true, key_size_magnitude < magnitude);
+
+  const std::int64_t num_items       = 1ll << magnitude;
+  const std::int64_t num_unique_keys = 1ll << key_size_magnitude;
+
+  // Size of each key group
+  const std::int64_t key_size = num_items / num_unique_keys;
+
+  using counting_it      = thrust::counting_iterator<std::int64_t>;
+  using transform_key_it = thrust::transform_iterator<div_op, counting_it>;
+  using transform_val_it = thrust::transform_iterator<mod_op, counting_it>;
+
+  counting_it count_begin(0ll);
+  counting_it count_end = count_begin + num_items;
+  ASSERT_EQUAL(static_cast<std::int64_t>(thrust::distance(count_begin, count_end)),
+               num_items);
+
+  transform_key_it keys_begin(count_begin, div_op{key_size});
+  transform_key_it keys_end(count_end, div_op{key_size});
+
+  transform_val_it values_begin(count_begin, mod_op{key_size});
+
+  thrust::device_vector<std::int64_t> output_keys(num_unique_keys);
+  thrust::device_vector<std::int64_t> output_values(num_unique_keys);
+
+  // example:
+  //  items:        6
+  //  unique_keys:  2
+  //  key_size:     3
+  //  keys:         0 0 0 | 1 1 1 
+  //  values:       0 1 2 | 1 2 3
+  //  result:       3       6     = sum(range(key_size)) + key_size * key_id
+  thrust::reduce_by_key(keys_begin,
+                        keys_end,
+                        values_begin,
+                        output_keys.begin(),
+                        output_values.begin());
+
+  ASSERT_EQUAL(
+    true,
+    thrust::equal(output_keys.begin(), output_keys.end(), count_begin));
+
+  thrust::host_vector<std::int64_t> result = output_values;
+
+  const std::int64_t sum = (key_size - 1) * key_size / 2;
+  for (std::int64_t key_id = 0; key_id < num_unique_keys; key_id++)
+  {
+    ASSERT_EQUAL(result[key_id], sum + key_id * key_size);
+  }
+}
+
+void TestReduceByKeyWithBigIndexes()
+{
+  TestReduceByKeyWithBigIndexesHelper(30);
+  TestReduceByKeyWithBigIndexesHelper(31);
+  TestReduceByKeyWithBigIndexesHelper(32);
+  TestReduceByKeyWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestReduceByKeyWithBigIndexes);
diff --git a/testing/cuda/reduce_by_key.mk b/testing/cuda/reduce_by_key.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/reduce_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/remove.cu b/testing/cuda/remove.cu
index 3509cd31b..0331c24b8 100644
--- a/testing/cuda/remove.cu
+++ b/testing/cuda/remove.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename T, typename Iterator2>
 __global__
 void remove_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T val, Iterator2 result)
@@ -49,6 +50,7 @@ void remove_copy_if_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last
 {
   *result_end = thrust::remove_copy_if(exec, first, last, stencil_first, result, pred);
 }
+#endif
 
 
 template<typename T>
@@ -69,6 +71,7 @@ struct is_true
 };
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy>
 void TestRemoveDevice(ExecutionPolicy exec)
 {
@@ -328,6 +331,7 @@ void TestRemoveCopyIfStencilDeviceDevice()
   TestRemoveCopyIfStencilDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestRemoveCopyIfStencilDeviceDevice);
+#endif
 
 
 void TestRemoveCudaStreams()
diff --git a/testing/cuda/remove.mk b/testing/cuda/remove.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/remove.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/replace.cu b/testing/cuda/replace.cu
index 24a03b2d5..bb8b7faa9 100644
--- a/testing/cuda/replace.cu
+++ b/testing/cuda/replace.cu
@@ -10,6 +10,7 @@ struct less_than_five
 };
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename T1, typename T2>
 __global__
 void replace_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T1 old_value, T2 new_value)
@@ -258,6 +259,7 @@ void TestReplaceCopyIfStencilDeviceDevice()
   TestReplaceCopyIfStencilDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestReplaceCopyIfStencilDeviceDevice);
+#endif
 
 
 void TestReplaceCudaStreams()
diff --git a/testing/cuda/replace.mk b/testing/cuda/replace.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/replace.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/reverse.cu b/testing/cuda/reverse.cu
index 4f6dfab08..f6599ed61 100644
--- a/testing/cuda/reverse.cu
+++ b/testing/cuda/reverse.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator>
 __global__
 void reverse_kernel(ExecutionPolicy exec, Iterator first, Iterator last)
@@ -82,6 +83,7 @@ void TestReverseCopyDeviceDevice()
   TestReverseCopyDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestReverseCopyDeviceDevice);
+#endif
 
 
 void TestReverseCudaStreams()
diff --git a/testing/cuda/reverse.mk b/testing/cuda/reverse.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/reverse.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/scan.cu b/testing/cuda/scan.cu
index e67470cab..5a19798cd 100644
--- a/testing/cuda/scan.cu
+++ b/testing/cuda/scan.cu
@@ -4,6 +4,7 @@
 #include <thrust/functional.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
 __global__
 void inclusive_scan_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
@@ -116,6 +117,7 @@ struct TestScanDeviceDevice
   }
 };
 VariableUnitTest<TestScanDeviceDevice, IntegralTypes> TestScanDeviceDeviceInstance;
+#endif
 
 
 void TestScanCudaStreams()
@@ -212,3 +214,48 @@ void TestScanCudaStreams()
 }
 DECLARE_UNITTEST(TestScanCudaStreams);
 
+template <typename T>
+struct const_ref_plus_mod3
+{
+    T * table;
+
+    const_ref_plus_mod3(T * table) : table(table) {}
+
+    __host__ __device__
+    const T& operator()(T a, T b)
+    {
+        return table[(int) (a + b)];
+    }
+};
+
+static void TestInclusiveScanWithConstAccumulator(void)
+{
+    // add numbers modulo 3 with external lookup table
+    thrust::device_vector<int> data(7);
+    data[0] = 0;
+    data[1] = 1;
+    data[2] = 2;
+    data[3] = 1;
+    data[4] = 2;
+    data[5] = 0;
+    data[6] = 1;
+
+    thrust::device_vector<int> table(6);
+    table[0] = 0;
+    table[1] = 1;
+    table[2] = 2;
+    table[3] = 0;
+    table[4] = 1;
+    table[5] = 2;
+
+    thrust::inclusive_scan(data.begin(), data.end(), data.begin(), const_ref_plus_mod3<int>(thrust::raw_pointer_cast(&table[0])));
+    
+    ASSERT_EQUAL(data[0], 0);
+    ASSERT_EQUAL(data[1], 1);
+    ASSERT_EQUAL(data[2], 0);
+    ASSERT_EQUAL(data[3], 1);
+    ASSERT_EQUAL(data[4], 0);
+    ASSERT_EQUAL(data[5], 0);
+    ASSERT_EQUAL(data[6], 1);
+}
+DECLARE_UNITTEST(TestInclusiveScanWithConstAccumulator);
diff --git a/testing/cuda/scan.mk b/testing/cuda/scan.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/scan.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/scan_by_key.cu b/testing/cuda/scan_by_key.cu
index e65560edf..0fea161d7 100644
--- a/testing/cuda/scan_by_key.cu
+++ b/testing/cuda/scan_by_key.cu
@@ -4,6 +4,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
 __global__
 void inclusive_scan_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 result)
@@ -78,7 +79,7 @@ void TestScanByKeyDevice(ExecutionPolicy exec)
   }
   ASSERT_EQUAL(d_output, h_output);
   
-  // in-place scans
+  // in-place scans: in/out values aliasing
   h_output = h_vals;
   d_output = d_vals;
   thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_output.begin(), h_output.begin());
@@ -98,6 +99,24 @@ void TestScanByKeyDevice(ExecutionPolicy exec)
     ASSERT_EQUAL(cudaSuccess, err);
   }
   ASSERT_EQUAL(d_output, h_output);
+
+  // in-place scans: keys/values aliasing
+  thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin());
+  inclusive_scan_by_key_kernel<<<1,1>>>(exec, d_keys.begin(), d_keys.end(), d_vals.begin(), d_keys.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  ASSERT_EQUAL(d_keys, h_output);
+
+  d_keys = h_keys;
+  thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin(), 11);
+  exclusive_scan_by_key_kernel<<<1,1>>>(exec, d_keys.begin(), d_keys.end(), d_vals.begin(), d_keys.begin(), 11);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  ASSERT_EQUAL(d_keys, h_output);
 }
 
 
@@ -113,6 +132,7 @@ void TestScanByKeyDeviceDevice()
   TestScanByKeyDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestScanByKeyDeviceDevice);
+#endif
 
 
 void TestInclusiveScanByKeyCudaStreams()
diff --git a/testing/cuda/scan_by_key.mk b/testing/cuda/scan_by_key.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/scan_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/scatter.cu b/testing/cuda/scatter.cu
index 52bd9755f..92e7f342a 100644
--- a/testing/cuda/scatter.cu
+++ b/testing/cuda/scatter.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 #include <algorithm>
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
 __global__
 void scatter_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 map_first, Iterator3 result)
@@ -112,6 +113,7 @@ void TestScatterIfDeviceDevice()
   TestScatterIfDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestScatterIfDeviceDevice);
+#endif
 
 
 void TestScatterCudaStreams()
diff --git a/testing/cuda/scatter.mk b/testing/cuda/scatter.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/scatter.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/sequence.cu b/testing/cuda/sequence.cu
index acbe09848..16b2d799b 100644
--- a/testing/cuda/sequence.cu
+++ b/testing/cuda/sequence.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator>
 __global__
 void sequence_kernel(ExecutionPolicy exec, Iterator first, Iterator last)
@@ -80,6 +81,7 @@ void TestSequenceDeviceDevice()
   TestSequenceDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestSequenceDeviceDevice);
+#endif
 
 void TestSequenceCudaStreams()
 {
diff --git a/testing/cuda/sequence.mk b/testing/cuda/sequence.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/sequence.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/set_difference.cu b/testing/cuda/set_difference.cu
index d87db42d9..bd9da131f 100644
--- a/testing/cuda/set_difference.cu
+++ b/testing/cuda/set_difference.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4>
 __global__
 void set_difference_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, Iterator2 last2, Iterator3 result1, Iterator4 result2)
@@ -52,6 +53,7 @@ void TestSetDifferenceDeviceDevice()
   TestSetDifferenceDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestSetDifferenceDeviceDevice);
+#endif
 
 
 void TestSetDifferenceCudaStreams()
diff --git a/testing/cuda/set_difference.mk b/testing/cuda/set_difference.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/set_difference.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/set_difference_by_key.cu b/testing/cuda/set_difference_by_key.cu
index 31d2860b0..2c32466f1 100644
--- a/testing/cuda/set_difference_by_key.cu
+++ b/testing/cuda/set_difference_by_key.cu
@@ -4,6 +4,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Iterator5, typename Iterator6, typename Iterator7>
 __global__
 void set_difference_by_key_kernel(ExecutionPolicy exec,
@@ -82,6 +83,7 @@ void TestSetDifferenceByKeyDeviceDevice()
   TestSetDifferenceByKeyDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestSetDifferenceByKeyDeviceDevice);
+#endif
 
 
 void TestSetDifferenceByKeyCudaStreams()
diff --git a/testing/cuda/set_difference_by_key.mk b/testing/cuda/set_difference_by_key.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/set_difference_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/set_intersection.cu b/testing/cuda/set_intersection.cu
index a57bc1b2a..2bb30ea87 100644
--- a/testing/cuda/set_intersection.cu
+++ b/testing/cuda/set_intersection.cu
@@ -6,6 +6,7 @@
 #include <thrust/iterator/discard_iterator.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4>
 __global__
 void set_intersection_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1,
@@ -59,7 +60,16 @@ void TestSetIntersectionDeviceDevice()
 DECLARE_UNITTEST(TestSetIntersectionDeviceDevice);
 
 
-void TestSetIntersectionCudaStreams()
+void TestSetIntersectionDeviceNoSync()
+{
+  TestSetIntersectionDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestSetIntersectionDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestSetIntersectionCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
   typedef Vector::iterator Iterator;
@@ -77,7 +87,9 @@ void TestSetIntersectionCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
-  Iterator end = thrust::set_intersection(thrust::cuda::par.on(s),
+  auto streampolicy = policy.on(s);
+
+  Iterator end = thrust::set_intersection(streampolicy,
                                           a.begin(), a.end(),
                                           b.begin(), b.end(),
                                           result.begin());
@@ -88,5 +100,17 @@ void TestSetIntersectionCudaStreams()
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestSetIntersectionCudaStreams);
+
+void TestSetIntersectionCudaStreamsSync()
+{
+  TestSetIntersectionCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestSetIntersectionCudaStreamsSync);
+
+
+void TestSetIntersectionCudaStreamsNoSync()
+{
+  TestSetIntersectionCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestSetIntersectionCudaStreamsNoSync);
 
diff --git a/testing/cuda/set_intersection.mk b/testing/cuda/set_intersection.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/set_intersection.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/set_intersection_by_key.cu b/testing/cuda/set_intersection_by_key.cu
index a19f82221..fed6cb6f6 100644
--- a/testing/cuda/set_intersection_by_key.cu
+++ b/testing/cuda/set_intersection_by_key.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Iterator5, typename Iterator6>
 __global__
 void set_intersection_by_key_kernel(ExecutionPolicy exec,
@@ -73,7 +74,16 @@ void TestSetIntersectionByKeyDeviceDevice()
 DECLARE_UNITTEST(TestSetIntersectionByKeyDeviceDevice);
 
 
-void TestSetIntersectionByKeyCudaStreams()
+void TestSetIntersectionByKeyDeviceNoSync()
+{
+  TestSetIntersectionByKeyDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestSetIntersectionByKeyDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestSetIntersectionByKeyCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
   typedef Vector::iterator Iterator;
@@ -95,8 +105,10 @@ void TestSetIntersectionByKeyCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
+  auto streampolicy = policy.on(s);
+
   thrust::pair<Iterator,Iterator> end =
-    thrust::set_intersection_by_key(thrust::cuda::par.on(s),
+    thrust::set_intersection_by_key(streampolicy,
                                     a_key.begin(), a_key.end(),
                                     b_key.begin(), b_key.end(),
                                     a_val.begin(),
@@ -111,5 +123,17 @@ void TestSetIntersectionByKeyCudaStreams()
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestSetIntersectionByKeyCudaStreams);
+
+void TestSetIntersectionByKeyCudaStreamsSync()
+{
+  TestSetIntersectionByKeyCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestSetIntersectionByKeyCudaStreamsSync);
+
+
+void TestSetIntersectionByKeyCudaStreamsNoSync()
+{
+  TestSetIntersectionByKeyCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestSetIntersectionByKeyCudaStreamsNoSync);
 
diff --git a/testing/cuda/set_intersection_by_key.mk b/testing/cuda/set_intersection_by_key.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/set_intersection_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/set_symmetric_difference.cu b/testing/cuda/set_symmetric_difference.cu
index 34969886e..43fc0e993 100644
--- a/testing/cuda/set_symmetric_difference.cu
+++ b/testing/cuda/set_symmetric_difference.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4>
 __global__
 void set_symmetric_difference_kernel(ExecutionPolicy exec,
@@ -59,6 +60,7 @@ void TestSetSymmetricDifferenceDeviceDevice()
   TestSetSymmetricDifferenceDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestSetSymmetricDifferenceDeviceDevice);
+#endif
 
 
 void TestSetSymmetricDifferenceCudaStreams()
diff --git a/testing/cuda/set_symmetric_difference.mk b/testing/cuda/set_symmetric_difference.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/set_symmetric_difference.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/set_symmetric_difference_by_key.cu b/testing/cuda/set_symmetric_difference_by_key.cu
index 3a6c68ce9..7e7adba5e 100644
--- a/testing/cuda/set_symmetric_difference_by_key.cu
+++ b/testing/cuda/set_symmetric_difference_by_key.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Iterator5, typename Iterator6, typename Iterator7>
 __global__
 void set_symmetric_difference_by_key_kernel(ExecutionPolicy exec,
@@ -74,6 +75,7 @@ void TestSetSymmetricDifferenceByKeyDeviceDevice()
   TestSetSymmetricDifferenceByKeyDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestSetSymmetricDifferenceByKeyDeviceDevice);
+#endif
 
 
 void TestSetSymmetricDifferenceByKeyCudaStreams()
diff --git a/testing/cuda/set_symmetric_difference_by_key.mk b/testing/cuda/set_symmetric_difference_by_key.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/set_symmetric_difference_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/set_union.cu b/testing/cuda/set_union.cu
index fb5b543e1..058f0e700 100644
--- a/testing/cuda/set_union.cu
+++ b/testing/cuda/set_union.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4>
 __global__
 void set_union_kernel(ExecutionPolicy exec,
@@ -59,6 +60,7 @@ void TestSetUnionDeviceDevice()
   TestSetUnionDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestSetUnionDeviceDevice);
+#endif
 
 
 void TestSetUnionCudaStreams()
diff --git a/testing/cuda/set_union.mk b/testing/cuda/set_union.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/set_union.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/set_union_by_key.cu b/testing/cuda/set_union_by_key.cu
index 1be3d9302..013ebe11b 100644
--- a/testing/cuda/set_union_by_key.cu
+++ b/testing/cuda/set_union_by_key.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Iterator5, typename Iterator6, typename Iterator7>
 __global__
 void set_union_by_key_kernel(ExecutionPolicy exec,
@@ -73,6 +74,7 @@ void TestSetUnionByKeyDeviceDevice()
   TestSetUnionByKeyDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestSetUnionByKeyDeviceDevice);
+#endif
 
 
 void TestSetUnionByKeyCudaStreams()
diff --git a/testing/cuda/set_union_by_key.mk b/testing/cuda/set_union_by_key.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/set_union_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/sort.cu b/testing/cuda/sort.cu
index 7f3d6413c..c3d5ff2bc 100644
--- a/testing/cuda/sort.cu
+++ b/testing/cuda/sort.cu
@@ -4,19 +4,6 @@
 #include <thrust/execution_policy.h>
 
 
-template<typename ExecutionPolicy, typename Iterator, typename Compare, typename Iterator2>
-__global__
-void sort_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Compare comp, Iterator2 is_supported)
-{
-#if (__CUDA_ARCH__ >= 200)
-  *is_supported = true;
-  thrust::sort(exec, first, last, comp);
-#else
-  *is_supported = false;
-#endif
-}
-
-
 template<typename T>
 struct my_less
 {
@@ -28,25 +15,29 @@ struct my_less
 };
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
+template<typename ExecutionPolicy, typename Iterator, typename Compare>
+__global__
+void sort_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Compare comp)
+{
+  thrust::sort(exec, first, last, comp);
+}
+
+
 template<typename T, typename ExecutionPolicy, typename Compare>
 void TestComparisonSortDevice(ExecutionPolicy exec, const size_t n, Compare comp)
 {
   thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
   thrust::device_vector<T> d_data = h_data;
   
-  thrust::device_vector<bool> is_supported(1);
-
-  sort_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), comp, is_supported.begin());
+  sort_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), comp);
   cudaError_t const err = cudaDeviceSynchronize();
   ASSERT_EQUAL(cudaSuccess, err);
 
 
-  if(is_supported[0])
-  {
-    thrust::sort(h_data.begin(), h_data.end(), comp);
-    
-    ASSERT_EQUAL(h_data, d_data);
-  }
+  thrust::sort(h_data.begin(), h_data.end(), comp);
+
+  ASSERT_EQUAL(h_data, d_data);
 };
 
 
@@ -111,6 +102,7 @@ VariableUnitTest<
   TestSortDeviceDevice,
   unittest::type_list<unittest::int8_t,unittest::int32_t>
 > TestSortDeviceDeviceInstance;
+#endif
 
 
 void TestSortCudaStreams()
@@ -163,7 +155,7 @@ void TestComparisonSortCudaStreams()
   cudaStreamSynchronize(s);
 
   ASSERT_EQUAL(true, thrust::is_sorted(keys.begin(), keys.end(), my_less<int>()));
-                      
+
   cudaStreamDestroy(s);
 }
 DECLARE_UNITTEST(TestComparisonSortCudaStreams);
diff --git a/testing/cuda/sort.mk b/testing/cuda/sort.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/sort.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/sort_by_key.cu b/testing/cuda/sort_by_key.cu
index 1e848879b..ee2b44ea0 100644
--- a/testing/cuda/sort_by_key.cu
+++ b/testing/cuda/sort_by_key.cu
@@ -4,19 +4,6 @@
 #include <thrust/functional.h>
 
 
-template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Compare, typename Iterator3>
-__global__
-void sort_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Compare comp, Iterator3 is_supported)
-{
-#if (__CUDA_ARCH__ >= 200)
-  *is_supported = true;
-  thrust::sort_by_key(exec, keys_first, keys_last, values_first, comp);
-#else
-  *is_supported = false;
-#endif
-}
-
-
 template<typename T>
 struct my_less
 {
@@ -28,6 +15,15 @@ struct my_less
 };
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Compare>
+__global__
+void sort_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Compare comp)
+{
+  thrust::sort_by_key(exec, keys_first, keys_last, values_first, comp);
+}
+
+
 template<typename T, typename ExecutionPolicy, typename Compare>
 void TestComparisonSortByKeyDevice(ExecutionPolicy exec, const size_t n, Compare comp)
 {
@@ -36,19 +32,15 @@ void TestComparisonSortByKeyDevice(ExecutionPolicy exec, const size_t n, Compare
 
   thrust::host_vector<T>   h_values = h_keys;
   thrust::device_vector<T> d_values = d_keys;
-  
-  thrust::device_vector<bool> is_supported(1);
-  sort_by_key_kernel<<<1,1>>>(exec, d_keys.begin(), d_keys.end(), d_values.begin(), comp, is_supported.begin());
+
+  sort_by_key_kernel<<<1,1>>>(exec, d_keys.begin(), d_keys.end(), d_values.begin(), comp);
   cudaError_t const err = cudaDeviceSynchronize();
   ASSERT_EQUAL(cudaSuccess, err);
 
-  if(is_supported[0])
-  {
-    thrust::sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin(), comp);
-    
-    ASSERT_EQUAL(h_keys, d_keys);
-    ASSERT_EQUAL(h_values, d_values);
-  }
+  thrust::sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin(), comp);
+
+  ASSERT_EQUAL(h_keys, d_keys);
+  ASSERT_EQUAL(h_values, d_values);
 };
 
 
@@ -113,6 +105,7 @@ VariableUnitTest<
   TestSortByKeyDeviceDevice,
   unittest::type_list<unittest::int8_t,unittest::int32_t>
 > TestSortByKeyDeviceDeviceInstance;
+#endif
 
 
 void TestComparisonSortByKeyCudaStreams()
@@ -139,7 +132,7 @@ void TestComparisonSortByKeyCudaStreams()
 
   ASSERT_EQUAL(true, thrust::is_sorted(keys.begin(), keys.end()));
   ASSERT_EQUAL(true, thrust::is_sorted(vals.begin(), vals.end()));
-                      
+
   cudaStreamDestroy(s);
 }
 DECLARE_UNITTEST(TestComparisonSortByKeyCudaStreams);
@@ -169,7 +162,7 @@ void TestSortByKeyCudaStreams()
 
   ASSERT_EQUAL(true, thrust::is_sorted(keys.begin(), keys.end()));
   ASSERT_EQUAL(true, thrust::is_sorted(vals.begin(), vals.end()));
-                      
+
   cudaStreamDestroy(s);
 }
 DECLARE_UNITTEST(TestSortByKeyCudaStreams);
diff --git a/testing/cuda/sort_by_key.mk b/testing/cuda/sort_by_key.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/sort_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/stream_legacy.cu b/testing/cuda/stream_legacy.cu
new file mode 100644
index 000000000..51c82a096
--- /dev/null
+++ b/testing/cuda/stream_legacy.cu
@@ -0,0 +1,21 @@
+#include <unittest/unittest.h>
+#include <thrust/execution_policy.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <thread>
+
+void verify_stream()
+{
+  auto exec = thrust::device;
+  auto stream = thrust::cuda_cub::stream(exec);
+  ASSERT_EQUAL(stream, cudaStreamLegacy);
+}
+
+void TestLegacyDefaultStream()
+{
+  verify_stream();
+
+  std::thread t(verify_stream);
+  t.join();
+}
+DECLARE_UNITTEST(TestLegacyDefaultStream);
diff --git a/testing/cuda/stream_per_thread.cmake b/testing/cuda/stream_per_thread.cmake
new file mode 100644
index 000000000..2cea2f938
--- /dev/null
+++ b/testing/cuda/stream_per_thread.cmake
@@ -0,0 +1,13 @@
+# This test should always use per-thread streams on NVCC.
+set_target_properties(${test_target} PROPERTIES
+  COMPILE_OPTIONS
+    $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:--default-stream=per-thread>
+)
+
+thrust_fix_clang_nvcc_build_for(${test_target})
+
+# NVC++ does not have an equivalent option, and will always
+# use the global stream by default.
+if (CMAKE_CUDA_COMPILER_ID STREQUAL "Feta")
+  set_tests_properties(${test_target} PROPERTIES WILL_FAIL ON)
+endif()
diff --git a/testing/cuda/stream_per_thread.cu b/testing/cuda/stream_per_thread.cu
new file mode 100644
index 000000000..ef126e78a
--- /dev/null
+++ b/testing/cuda/stream_per_thread.cu
@@ -0,0 +1,21 @@
+#include <unittest/unittest.h>
+#include <thrust/execution_policy.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <thread>
+
+void verify_stream()
+{
+  auto exec = thrust::device;
+  auto stream = thrust::cuda_cub::stream(exec);
+  ASSERT_EQUAL(stream, cudaStreamPerThread);
+}
+
+void TestPerThreadDefaultStream()
+{
+  verify_stream();
+
+  std::thread t(verify_stream);
+  t.join();
+}
+DECLARE_UNITTEST(TestPerThreadDefaultStream);
diff --git a/testing/cuda/stream_per_thread.mk b/testing/cuda/stream_per_thread.mk
new file mode 100644
index 000000000..da9adfe1b
--- /dev/null
+++ b/testing/cuda/stream_per_thread.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += --default-stream per-thread
diff --git a/testing/cuda/swap_ranges.cu b/testing/cuda/swap_ranges.cu
index e2392bbe2..ebc396e83 100644
--- a/testing/cuda/swap_ranges.cu
+++ b/testing/cuda/swap_ranges.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
 __global__
 void swap_ranges_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2)
@@ -50,6 +51,7 @@ void TestSwapRangesDeviceDevice()
   TestSwapRangesDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestSwapRangesDeviceDevice);
+#endif
 
 void TestSwapRangesCudaStreams()
 {
diff --git a/testing/cuda/swap_ranges.mk b/testing/cuda/swap_ranges.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/swap_ranges.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/tabulate.cu b/testing/cuda/tabulate.cu
index 564d85e7e..b449fb7cc 100644
--- a/testing/cuda/tabulate.cu
+++ b/testing/cuda/tabulate.cu
@@ -4,6 +4,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename Function>
 __global__
 void tabulate_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f)
@@ -69,6 +70,7 @@ void TestTabulateDeviceDevice()
   TestTabulateDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestTabulateDeviceDevice);
+#endif
 
 void TestTabulateCudaStreams()
 {
diff --git a/testing/cuda/tabulate.mk b/testing/cuda/tabulate.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/tabulate.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/transform.cu b/testing/cuda/transform.cu
index fa0358e57..7739089e6 100644
--- a/testing/cuda/transform.cu
+++ b/testing/cuda/transform.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Function, typename Iterator3>
 __global__
 void transform_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result1, Function f, Iterator3 result2)
@@ -270,6 +271,7 @@ void TestTransformIfBinaryDeviceDevice()
   TestTransformIfBinaryDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestTransformIfBinaryDeviceDevice);
+#endif
 
 void TestTransformUnaryCudaStreams()
 {
diff --git a/testing/cuda/transform.mk b/testing/cuda/transform.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/transform.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/transform_reduce.cu b/testing/cuda/transform_reduce.cu
index dcc8f646b..c55aa66e7 100644
--- a/testing/cuda/transform_reduce.cu
+++ b/testing/cuda/transform_reduce.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Function1, typename T, typename Function2, typename Iterator2>
 __global__
 void transform_reduce_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Function1 f1, T init, Function2 f2, Iterator2 result)
@@ -44,6 +45,7 @@ void TestTransformReduceDeviceDevice()
   TestTransformReduceDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestTransformReduceDeviceDevice);
+#endif
 
 
 void TestTransformReduceCudaStreams()
diff --git a/testing/cuda/transform_reduce.mk b/testing/cuda/transform_reduce.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/transform_reduce.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/transform_scan.cu b/testing/cuda/transform_scan.cu
index e629fcdff..de0d1524f 100644
--- a/testing/cuda/transform_scan.cu
+++ b/testing/cuda/transform_scan.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Function1, typename Function2, typename Iterator3>
 __global__
 void transform_inclusive_scan_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result1, Function1 f1, Function2 f2, Iterator3 result2)
@@ -115,6 +116,7 @@ void TestTransformScanDeviceDevice()
   TestTransformScanDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestTransformScanDeviceDevice);
+#endif
 
 
 void TestTransformScanCudaStreams()
@@ -184,3 +186,30 @@ void TestTransformScanCudaStreams()
 }
 DECLARE_UNITTEST(TestTransformScanCudaStreams);
 
+void TestTransformScanConstAccumulator()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector::iterator iter;
+
+  Vector input(5);
+  Vector reference(5);
+  Vector output(5);
+
+  input[0] = 1;
+  input[1] = 3;
+  input[2] = -2;
+  input[3] = 4;
+  input[4] = -5;
+
+  thrust::transform_inclusive_scan(input.begin(),
+                                   input.end(),
+                                   output.begin(),
+                                   thrust::identity<T>(),
+                                   thrust::plus<T>());
+  thrust::inclusive_scan(input.begin(), input.end(), reference.begin(), thrust::plus<T>());
+
+  ASSERT_EQUAL(output, reference);
+}
+DECLARE_UNITTEST(TestTransformScanConstAccumulator);
diff --git a/testing/cuda/transform_scan.mk b/testing/cuda/transform_scan.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/transform_scan.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/uninitialized_copy.cu b/testing/cuda/uninitialized_copy.cu
index 31feb0716..735e2dac3 100644
--- a/testing/cuda/uninitialized_copy.cu
+++ b/testing/cuda/uninitialized_copy.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
 __global__
 void uninitialized_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
@@ -45,6 +46,7 @@ void TestUninitializedCopyDeviceDevice()
   TestUninitializedCopyDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestUninitializedCopyDeviceDevice);
+#endif
 
 
 void TestUninitializedCopyCudaStreams()
@@ -74,6 +76,7 @@ void TestUninitializedCopyCudaStreams()
 DECLARE_UNITTEST(TestUninitializedCopyCudaStreams);
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Size, typename Iterator2>
 __global__
 void uninitialized_copy_n_kernel(ExecutionPolicy exec, Iterator1 first, Size n, Iterator2 result)
@@ -116,6 +119,7 @@ void TestUninitializedCopyNDeviceDevice()
   TestUninitializedCopyNDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestUninitializedCopyNDeviceDevice);
+#endif
 
 
 void TestUninitializedCopyNCudaStreams()
diff --git a/testing/cuda/uninitialized_copy.mk b/testing/cuda/uninitialized_copy.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/uninitialized_copy.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/uninitialized_fill.cu b/testing/cuda/uninitialized_fill.cu
index fd7477347..bb222cf02 100644
--- a/testing/cuda/uninitialized_fill.cu
+++ b/testing/cuda/uninitialized_fill.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename T>
 __global__
 void uninitialized_fill_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T val)
@@ -90,6 +91,7 @@ void TestUninitializedFillDeviceDevice()
   TestUninitializedFillDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestUninitializedFillDeviceDevice);
+#endif
 
 
 void TestUninitializedFillCudaStreams()
@@ -119,6 +121,7 @@ void TestUninitializedFillCudaStreams()
 DECLARE_UNITTEST(TestUninitializedFillCudaStreams);
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Size, typename T, typename Iterator2>
 __global__
 void uninitialized_fill_n_kernel(ExecutionPolicy exec, Iterator1 first, Size n, T val, Iterator2 result)
@@ -163,9 +166,6 @@ void TestUninitializedFillNDevice(ExecutionPolicy exec)
     ASSERT_EQUAL(cudaSuccess, err);
   }
 
-  cudaError_t const err = cudaDeviceSynchronize();
-  ASSERT_EQUAL(cudaSuccess, err);
-
   iter = iter_vec[0];
   
   ASSERT_EQUAL(v[0], exemplar);
@@ -223,6 +223,7 @@ void TestUninitializedFillNDeviceDevice()
   TestUninitializedFillNDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestUninitializedFillNDeviceDevice);
+#endif
 
 
 void TestUninitializedFillNCudaStreams()
diff --git a/testing/cuda/uninitialized_fill.mk b/testing/cuda/uninitialized_fill.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/uninitialized_fill.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/unique.cu b/testing/cuda/unique.cu
index c0dc7973d..136ba76fd 100644
--- a/testing/cuda/unique.cu
+++ b/testing/cuda/unique.cu
@@ -3,6 +3,15 @@
 #include <thrust/execution_policy.h>
 
 
+template<typename T>
+struct is_equal_div_10_unique
+{
+  __host__ __device__
+  bool operator()(const T x, const T& y) const { return ((int) x / 10) == ((int) y / 10); }
+};
+
+
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
 __global__
 void unique_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
@@ -19,14 +28,6 @@ void unique_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Binary
 }
 
 
-template<typename T>
-struct is_equal_div_10_unique
-{
-  __host__ __device__
-  bool operator()(const T x, const T& y) const { return ((int) x / 10) == ((int) y / 10); }
-};
-
-
 template<typename ExecutionPolicy>
 void TestUniqueDevice(ExecutionPolicy exec)
 {
@@ -94,7 +95,16 @@ void TestUniqueDeviceDevice()
 DECLARE_UNITTEST(TestUniqueDeviceDevice);
 
 
-void TestUniqueCudaStreams()
+void TestUniqueDeviceNoSync()
+{
+  TestUniqueDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestUniqueDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestUniqueCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
   typedef Vector::value_type T;
@@ -116,8 +126,10 @@ void TestUniqueCudaStreams()
 
   cudaStream_t s;
   cudaStreamCreate(&s);
+
+  auto streampolicy = policy.on(s);
   
-  new_last = thrust::unique(thrust::cuda::par.on(s), data.begin(), data.end());
+  new_last = thrust::unique(streampolicy, data.begin(), data.end());
   cudaStreamSynchronize(s);
 
   ASSERT_EQUAL(new_last - data.begin(), 7);
@@ -129,7 +141,7 @@ void TestUniqueCudaStreams()
   ASSERT_EQUAL(data[5], 31);
   ASSERT_EQUAL(data[6], 37);
 
-  new_last = thrust::unique(thrust::cuda::par.on(s), data.begin(), new_last, is_equal_div_10_unique<T>());
+  new_last = thrust::unique(streampolicy, data.begin(), new_last, is_equal_div_10_unique<T>());
   cudaStreamSynchronize(s);
 
   ASSERT_EQUAL(new_last - data.begin(), 3);
@@ -139,9 +151,22 @@ void TestUniqueCudaStreams()
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestUniqueCudaStreams);
 
+void TestUniqueCudaStreamsSync()
+{
+  TestUniqueCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestUniqueCudaStreamsSync);
 
+
+void TestUniqueCudaStreamsNoSync()
+{
+  TestUniqueCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestUniqueCudaStreamsNoSync);
+
+
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
 __global__
 void unique_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result1, Iterator3 result2)
@@ -227,7 +252,16 @@ void TestUniqueCopyDeviceDevice()
 DECLARE_UNITTEST(TestUniqueCopyDeviceDevice);
 
 
-void TestUniqueCopyCudaStreams()
+void TestUniqueCopyDeviceNoSync()
+{
+  TestUniqueCopyDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestUniqueCopyDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestUniqueCopyCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
   typedef Vector::value_type T;
@@ -251,8 +285,10 @@ void TestUniqueCopyCudaStreams()
 
   cudaStream_t s;
   cudaStreamCreate(&s);
+
+  auto streampolicy = policy.on(s);
   
-  new_last = thrust::unique_copy(thrust::cuda::par.on(s), data.begin(), data.end(), output.begin());
+  new_last = thrust::unique_copy(streampolicy, data.begin(), data.end(), output.begin());
   cudaStreamSynchronize(s);
 
   ASSERT_EQUAL(new_last - output.begin(), 7);
@@ -264,7 +300,7 @@ void TestUniqueCopyCudaStreams()
   ASSERT_EQUAL(output[5], 31);
   ASSERT_EQUAL(output[6], 37);
 
-  new_last = thrust::unique_copy(thrust::cuda::par.on(s), output.begin(), new_last, data.begin(), is_equal_div_10_unique<T>());
+  new_last = thrust::unique_copy(streampolicy, output.begin(), new_last, data.begin(), is_equal_div_10_unique<T>());
   cudaStreamSynchronize(s);
 
   ASSERT_EQUAL(new_last - data.begin(), 3);
@@ -274,5 +310,144 @@ void TestUniqueCopyCudaStreams()
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestUniqueCopyCudaStreams);
+
+void TestUniqueCopyCudaStreamsSync()
+{
+  TestUniqueCopyCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestUniqueCopyCudaStreamsSync);
+
+
+void TestUniqueCopyCudaStreamsNoSync()
+{
+  TestUniqueCopyCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestUniqueCopyCudaStreamsNoSync);
+
+
+#ifdef THRUST_TEST_DEVICE_SIDE
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
+__global__
+void unique_count_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
+{
+  *result = thrust::unique_count(exec, first, last);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename BinaryPredicate, typename Iterator2>
+__global__
+void unique_count_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, BinaryPredicate pred, Iterator2 result)
+{
+  *result = thrust::unique_count(exec, first, last, pred);
+}
+
+
+template<typename ExecutionPolicy>
+void TestUniqueCountDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(10);
+  data[0] = 11; 
+  data[1] = 11; 
+  data[2] = 12;
+  data[3] = 20; 
+  data[4] = 29; 
+  data[5] = 21; 
+  data[6] = 21; 
+  data[7] = 31; 
+  data[8] = 31; 
+  data[9] = 37; 
+  
+  Vector output(1, -1);
+  
+  unique_count_kernel<<<1,1>>>(exec, data.begin(), data.end(), output.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(output[0], 7);
+
+  unique_count_kernel<<<1,1>>>(exec, data.begin(), data.end(), is_equal_div_10_unique<T>(), output.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(output[0], 3);
+}
+
+
+void TestUniqueCountDeviceSeq()
+{
+  TestUniqueCountDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestUniqueCountDeviceSeq);
+
+
+void TestUniqueCountDeviceDevice()
+{
+  TestUniqueCountDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestUniqueCountDeviceDevice);
+
+
+void TestUniqueCountDeviceNoSync()
+{
+  TestUniqueCountDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestUniqueCountDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestUniqueCountCudaStreams(ExecutionPolicy policy)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(10);
+  data[0] = 11; 
+  data[1] = 11; 
+  data[2] = 12;
+  data[3] = 20; 
+  data[4] = 29; 
+  data[5] = 21; 
+  data[6] = 21; 
+  data[7] = 31; 
+  data[8] = 31; 
+  data[9] = 37;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  auto streampolicy = policy.on(s);
+  
+  int result = thrust::unique_count(streampolicy, data.begin(), data.end());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(result, 7);
+
+  result = thrust::unique_count(streampolicy, data.begin(), data.end(), is_equal_div_10_unique<T>());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(result, 3);
+
+  cudaStreamDestroy(s);
+}
+
+void TestUniqueCountCudaStreamsSync()
+{
+  TestUniqueCountCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestUniqueCountCudaStreamsSync);
+
+
+void TestUniqueCountCudaStreamsNoSync()
+{
+  TestUniqueCountCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestUniqueCountCudaStreamsNoSync);
 
diff --git a/testing/cuda/unique.mk b/testing/cuda/unique.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/unique.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/unique_by_key.cu b/testing/cuda/unique_by_key.cu
index c58a64d51..d96cbdc6c 100644
--- a/testing/cuda/unique_by_key.cu
+++ b/testing/cuda/unique_by_key.cu
@@ -44,6 +44,7 @@ void initialize_values(Vector& values)
 }
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
 __global__
 void unique_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 result)
@@ -134,7 +135,16 @@ void TestUniqueByKeyDeviceDevice()
 DECLARE_UNITTEST(TestUniqueByKeyDeviceDevice);
 
 
-void TestUniqueByKeyCudaStreams()
+void TestUniqueByKeyDeviceNoSync()
+{
+  TestUniqueByKeyDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestUniqueByKeyDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestUniqueByKeyCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
   typedef Vector::value_type T;
@@ -150,8 +160,10 @@ void TestUniqueByKeyCudaStreams()
 
   cudaStream_t s;
   cudaStreamCreate(&s);
+
+  auto streampolicy = policy.on(s);
   
-  new_last = thrust::unique_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin());
+  new_last = thrust::unique_by_key(streampolicy, keys.begin(), keys.end(), values.begin());
   cudaStreamSynchronize(s);
   
   ASSERT_EQUAL(new_last.first  - keys.begin(),   5);
@@ -171,7 +183,7 @@ void TestUniqueByKeyCudaStreams()
   // test BinaryPredicate
   initialize_keys(keys);  initialize_values(values);
   
-  new_last = thrust::unique_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), is_equal_div_10_unique<T>());
+  new_last = thrust::unique_by_key(streampolicy, keys.begin(), keys.end(), values.begin(), is_equal_div_10_unique<T>());
   
   ASSERT_EQUAL(new_last.first  - keys.begin(),   3);
   ASSERT_EQUAL(new_last.second - values.begin(), 3);
@@ -185,9 +197,22 @@ void TestUniqueByKeyCudaStreams()
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestUniqueByKeyCudaStreams);
+
+void TestUniqueByKeyCudaStreamsSync()
+{
+  TestUniqueByKeyCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestUniqueByKeyCudaStreamsSync);
+
+
+void TestUniqueByKeyCudaStreamsNoSync()
+{
+  TestUniqueByKeyCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestUniqueByKeyCudaStreamsNoSync);
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Iterator5>
 __global__
 void unique_by_key_copy_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 keys_result, Iterator4 values_result, Iterator5 result)
@@ -282,7 +307,16 @@ void TestUniqueCopyByKeyDeviceDevice()
 DECLARE_UNITTEST(TestUniqueCopyByKeyDeviceDevice);
 
 
-void TestUniqueCopyByKeyCudaStreams()
+void TestUniqueCopyByKeyDeviceNoSync()
+{
+  TestUniqueCopyByKeyDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestUniqueCopyByKeyDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestUniqueCopyByKeyCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
   typedef Vector::value_type T;
@@ -302,7 +336,9 @@ void TestUniqueCopyByKeyCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
-  new_last = thrust::unique_by_key_copy(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin());
+  auto streampolicy = policy.on(s);
+
+  new_last = thrust::unique_by_key_copy(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin());
   cudaStreamSynchronize(s);
 
   ASSERT_EQUAL(new_last.first  - output_keys.begin(),   5);
@@ -322,7 +358,7 @@ void TestUniqueCopyByKeyCudaStreams()
   // test BinaryPredicate
   initialize_keys(keys);  initialize_values(values);
   
-  new_last = thrust::unique_by_key_copy(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_unique<T>());
+  new_last = thrust::unique_by_key_copy(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_unique<T>());
   cudaStreamSynchronize(s);
 
   ASSERT_EQUAL(new_last.first  - output_keys.begin(),   3);
@@ -337,5 +373,17 @@ void TestUniqueCopyByKeyCudaStreams()
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestUniqueCopyByKeyCudaStreams);
+
+void TestUniqueCopyByKeyCudaStreamsSync()
+{
+  TestUniqueCopyByKeyCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestUniqueCopyByKeyCudaStreamsSync);
+
+
+void TestUniqueCopyByKeyCudaStreamsNoSync()
+{
+  TestUniqueCopyByKeyCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestUniqueCopyByKeyCudaStreamsNoSync);
 
diff --git a/testing/cuda/unique_by_key.mk b/testing/cuda/unique_by_key.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/unique_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/dependencies_aware_policies.cu b/testing/dependencies_aware_policies.cu
index 5f48bf4f2..531339215 100644
--- a/testing/dependencies_aware_policies.cu
+++ b/testing/dependencies_aware_policies.cu
@@ -1,5 +1,6 @@
 #include <unittest/unittest.h>
 
+#include <thrust/detail/config.h>
 #include <thrust/detail/seq.h>
 #include <thrust/system/cpp/detail/par.h>
 #include <thrust/system/omp/detail/par.h>
@@ -9,7 +10,7 @@
 #  include <thrust/system/cuda/detail/par.h>
 #endif
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
 
 template<typename T>
 struct test_allocator_t
@@ -178,11 +179,11 @@ SimpleUnitTest<
     >
 > TestDependencyAttachmentInstance;
 
-#else
+#else // C++11
 
 void TestDummy()
 {
 }
 DECLARE_UNITTEST(TestDummy);
 
-#endif
+#endif // C++11
diff --git a/testing/device_delete.cu b/testing/device_delete.cu
index 6684cb2b5..12f757fa4 100644
--- a/testing/device_delete.cu
+++ b/testing/device_delete.cu
@@ -4,21 +4,23 @@
 #include <thrust/device_new.h>
 #include <thrust/device_delete.h>
 
+#include <nv/target>
+
 struct Foo
 {
   __host__ __device__
   Foo(void)
-    :set_me_upon_destruction(0)
+    : set_me_upon_destruction{nullptr}
   {}
 
   __host__ __device__
   ~Foo(void)
   {
-#ifdef __CUDA_ARCH__
-    // __device__ overload
-    if(set_me_upon_destruction != 0)
-      *set_me_upon_destruction = true;
-#endif
+    NV_IF_TARGET(NV_IS_DEVICE, (
+      if (set_me_upon_destruction != nullptr)
+      {
+        *set_me_upon_destruction = true;
+      }));
   }
 
   bool *set_me_upon_destruction;
diff --git a/testing/docs/doxybook_test.h b/testing/docs/doxybook_test.h
new file mode 100644
index 000000000..244648ee1
--- /dev/null
+++ b/testing/docs/doxybook_test.h
@@ -0,0 +1,222 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file
+ *  \brief Test case for Doxybook rendering.
+ */
+
+#pragma once
+
+namespace thrust
+{
+
+/*! \addtogroup test Test
+ *  \{
+ */
+
+/*! \brief \c test_predefined_friend_struct is a class intended to exercise and
+ *  test Doxybook rendering.
+ */
+template <typename... Z>
+struct test_predefined_friend_struct {};
+
+/*! \brief \c test_predefined_friend_function is a function intended to
+ *  exercise and test Doxybook rendering.
+ */
+template <typename Z>
+void test_predefined_friend_function();
+
+/*! \brief \c test_class is a class intended to exercise and test Doxybook
+ *  rendering.
+ *
+ *  It does many things.
+ *
+ *  \tparam T A template parameter.
+ *  \tparam U Another template parameter.
+ *
+ *  \see test_function
+ */
+template <typename T, typename U>
+class test_class
+{
+public:
+  template <typename Z>
+  struct test_nested_struct {};
+
+  int test_member_variable = 0; ///< A test member variable.
+
+  [[deprecated]] static constexpr int test_member_constant = 42; ///< A test member constant.
+
+  template <typename X, typename Y>
+  using test_type_alias = test_class<X, Y>;
+
+  enum class test_enum_class {
+    A = 15, ///< An enumerator. It is equal to 15.
+    B,
+    C
+  };
+
+  /*! \brief Construct an empty test class.
+   */
+  test_class() = default;
+
+  /*! \brief Construct a test class.
+   */
+  __host__ __device__ constexpr
+  test_class(int);
+
+  /*! \brief \c test_member_function is a function intended to exercise
+   *  and test Doxybook rendering.
+   */
+  __host__ __device__ constexpr
+  int test_member_function() = 0;
+
+  /*! \brief \c test_virtual_member_function is a function intended to exercise
+   *  and test Doxybook rendering.
+   */
+  __host__ __device__
+  virtual int test_virtual_member_function() = 0;
+
+  /*! \brief \c test_parameter_overflow_member_function is a function intended
+   *  to test Doxybook's rendering of function and template parameters that exceed
+   *  the length of a line.
+   */
+  template <typename A = test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int>,
+            typename B = test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int>,
+            typename C = test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int>>
+  test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int, int, int, int>
+  test_parameter_overflow_member_function(test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int, int, int, int> a,
+                                          test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int, int, int, int> b,
+                                          test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int, int, int, int> c);
+
+  template <typename Z>
+  friend void test_friend_function() {}
+
+  template <typename Z>
+  friend void test_predefined_friend_function();
+
+  template <typename... Z>
+  friend struct thrust::test_predefined_friend_struct;
+
+protected:
+
+  template <typename Z>
+  class test_protected_nested_class {};
+
+  /*! \brief \c test_protected_member_function is a function intended to
+   *  exercise and test Doxybook rendering.
+   */
+  __device__
+  auto test_protected_member_function();
+};
+
+/*! \brief \c test_derived_class is a derived class intended to exercise and
+ *  test Doxybook rendering.
+ */
+class test_derived_class : test_class<int, double>
+{
+  template <typename Z>
+  struct test_derived_nested_struct {};
+
+  double test_derived_member_variable = 3.14; ///< A test member variable.
+
+  typedef double test_typedef;
+
+  /*! \brief \c test_derived_member_function is a function intended to exercise
+   *  and test Doxybook rendering.
+   */
+  __host__ __device__ constexpr
+  double test_derived_member_function(int, int);
+};
+
+/*! \brief \c test_function is a function intended to exercise and test Doxybook
+ *  rendering.
+ *
+ *  \tparam T A template parameter.
+ *
+ *  \param a A function parameter.
+ *  \param b A function parameter.
+ */
+template <typename T>
+void test_function(T const& a, test_class<T, T const>&& b);
+
+/*! \brief \c test_parameter_overflow_function is a function intended to test
+ *  Doxybook's rendering of function and template parameters that exceed the
+ *  length of a line.
+ */
+template <typename T = test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int>,
+  typename U = test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int>,
+  typename V = test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int>
+>
+test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int, int, int, int>
+test_parameter_overflow_function(test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int> t,
+  test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int> u,
+  test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int> v);
+
+/*! \brief \c test_enum is an enum namespace intended to exercise and test
+ *  Doxybook rendering.
+ */
+enum class test_enum {
+  X = 1, ///< An enumerator. It is equal to 1.
+  Y = X,
+  Z = 2
+};
+
+/*! \brief \c test_alias is a type alias intended to exercise and test Doxybook
+ * rendering.
+ */
+using test_alias = test_class<int, double>;
+
+/*! \brief \c test_namespace is a namespace intended to exercise and test
+ *  Doxybook rendering.
+ */
+namespace test_namespace {
+
+inline constexpr int test_constant = 12;
+
+/*! \brief \c nested_function is a function intended to exercise and test
+ *  Doxybook rendering.
+ */
+template <typename T, typename U>
+auto test_nested_function(T t, U u) noexcept(noexcept(t + u)) -> decltype(t + u)
+{ return t + u; }
+
+/*! \brief \c test_struct is a struct intended to exercise and test Doxybook
+ *  rendering.
+ */
+template <typename Z>
+struct test_struct
+{
+  test_struct& operator=(test_struct const&) = default;
+
+  /*! \brief \c operator< is a function intended to exercise and test Doxybook
+   *  rendering.
+   */
+  bool operator<(test_struct const& t);
+};
+
+} // namespace test_namespace
+
+/*! \brief \c THRUST_TEST_MACRO is a macro intended to exercise and test
+ *  Doxybook rendering.
+ */
+#define THRUST_TEST_MACRO(x, y) thrust::test_namespace::nested_function(x, y)
+
+/*! \} // test
+ */
+
+} // namespace thrust
+
diff --git a/testing/equal.cu b/testing/equal.cu
index 932f3ccfd..ca9f7eb69 100644
--- a/testing/equal.cu
+++ b/testing/equal.cu
@@ -2,6 +2,8 @@
 #include <thrust/equal.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/retag.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
 
 template <class Vector>
 void TestEqualSimple(void)
@@ -102,3 +104,48 @@ void TestEqualDispatchImplicit()
 }
 DECLARE_UNITTEST(TestEqualDispatchImplicit);
 
+struct only_set_when_both_expected
+{
+    long long expected;
+    bool * flag;
+
+    __device__
+    bool operator()(long long x, long long y)
+    {
+        if (x == expected && y == expected)
+        {
+            *flag = true;
+        }
+
+        return x == y;
+    }
+};
+
+void TestEqualWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> has_executed = thrust::device_malloc<bool>(1);
+    *has_executed = false;
+
+    only_set_when_both_expected fn = { (1ll << magnitude) - 1,
+        thrust::raw_pointer_cast(has_executed) };
+
+    ASSERT_EQUAL(thrust::equal(thrust::device, begin, end, begin, fn), true);
+
+    bool has_executed_h = *has_executed;
+    thrust::device_free(has_executed);
+
+    ASSERT_EQUAL(has_executed_h, true);
+}
+
+void TestEqualWithBigIndexes()
+{
+    TestEqualWithBigIndexesHelper(30);
+    TestEqualWithBigIndexesHelper(31);
+    TestEqualWithBigIndexesHelper(32);
+    TestEqualWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestEqualWithBigIndexes);
diff --git a/testing/event.cu b/testing/event.cu
index a02f15fd7..581426919 100644
--- a/testing/event.cu
+++ b/testing/event.cu
@@ -1,6 +1,6 @@
 #include <thrust/detail/config.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <unittest/unittest.h>
 #include <unittest/util_async.h>
@@ -58,8 +58,6 @@ void test_event_new_stream()
 {
   auto e0 = thrust::device_event(thrust::new_stream);
 
-  auto e0_stream = e0.stream().native_handle();
-
   ASSERT_EQUAL(true, e0.valid_stream());
 
   ASSERT_NOT_EQUAL_QUIET(nullptr, e0.stream().native_handle());    
diff --git a/testing/fill.cu b/testing/fill.cu
index ec32dcd30..7154b4118 100644
--- a/testing/fill.cu
+++ b/testing/fill.cu
@@ -22,17 +22,17 @@ void TestFillSimple(void)
     ASSERT_EQUAL(v[2], 7);
     ASSERT_EQUAL(v[3], 7);
     ASSERT_EQUAL(v[4], 4);
-    
+
     thrust::fill(v.begin() + 0, v.begin() + 3, (T) 8);
-    
+
     ASSERT_EQUAL(v[0], 8);
     ASSERT_EQUAL(v[1], 8);
     ASSERT_EQUAL(v[2], 8);
     ASSERT_EQUAL(v[3], 7);
     ASSERT_EQUAL(v[4], 4);
-    
+
     thrust::fill(v.begin() + 2, v.end(), (T) 9);
-    
+
     ASSERT_EQUAL(v[0], 8);
     ASSERT_EQUAL(v[1], 8);
     ASSERT_EQUAL(v[2], 9);
@@ -40,7 +40,7 @@ void TestFillSimple(void)
     ASSERT_EQUAL(v[4], 9);
 
     thrust::fill(v.begin(), v.end(), (T) 1);
-    
+
     ASSERT_EQUAL(v[0], 1);
     ASSERT_EQUAL(v[1], 1);
     ASSERT_EQUAL(v[2], 1);
@@ -70,14 +70,14 @@ void TestFillMixedTypes(void)
     Vector v(4);
 
     thrust::fill(v.begin(), v.end(), bool(true));
-    
+
     ASSERT_EQUAL(v[0], 1);
     ASSERT_EQUAL(v[1], 1);
     ASSERT_EQUAL(v[2], 1);
     ASSERT_EQUAL(v[3], 1);
-    
+
     thrust::fill(v.begin(), v.end(), char(20));
-    
+
     ASSERT_EQUAL(v[0], 20);
     ASSERT_EQUAL(v[1], 20);
     ASSERT_EQUAL(v[2], 20);
@@ -101,17 +101,17 @@ void TestFill(size_t n)
     thrust::fill(d_data.begin() + std::min((size_t)117, n), d_data.begin() + std::min((size_t)367, n), (T) 1);
 
     ASSERT_EQUAL(h_data, d_data);
-    
+
     thrust::fill(h_data.begin() + std::min((size_t)8, n), h_data.begin() + std::min((size_t)259, n), (T) 2);
     thrust::fill(d_data.begin() + std::min((size_t)8, n), d_data.begin() + std::min((size_t)259, n), (T) 2);
 
     ASSERT_EQUAL(h_data, d_data);
-    
+
     thrust::fill(h_data.begin() + std::min((size_t)3, n), h_data.end(), (T) 3);
     thrust::fill(d_data.begin() + std::min((size_t)3, n), d_data.end(), (T) 3);
 
     ASSERT_EQUAL(h_data, d_data);
-    
+
     thrust::fill(h_data.begin(), h_data.end(), (T) 4);
     thrust::fill(d_data.begin(), d_data.end(), (T) 4);
 
@@ -135,18 +135,18 @@ void TestFillNSimple(void)
     ASSERT_EQUAL(v[3], 7);
     ASSERT_EQUAL(v[4], 4);
     ASSERT_EQUAL_QUIET(v.begin() + 4, iter);
-    
+
     iter = thrust::fill_n(v.begin() + 0, 3, (T) 8);
-    
+
     ASSERT_EQUAL(v[0], 8);
     ASSERT_EQUAL(v[1], 8);
     ASSERT_EQUAL(v[2], 8);
     ASSERT_EQUAL(v[3], 7);
     ASSERT_EQUAL(v[4], 4);
     ASSERT_EQUAL_QUIET(v.begin() + 3, iter);
-    
+
     iter = thrust::fill_n(v.begin() + 2, 3, (T) 9);
-    
+
     ASSERT_EQUAL(v[0], 8);
     ASSERT_EQUAL(v[1], 8);
     ASSERT_EQUAL(v[2], 9);
@@ -155,7 +155,7 @@ void TestFillNSimple(void)
     ASSERT_EQUAL_QUIET(v.end(), iter);
 
     iter = thrust::fill_n(v.begin(), v.size(), (T) 1);
-    
+
     ASSERT_EQUAL(v[0], 1);
     ASSERT_EQUAL(v[1], 1);
     ASSERT_EQUAL(v[2], 1);
@@ -192,15 +192,15 @@ void TestFillNMixedTypes(void)
     Vector v(4);
 
     typename Vector::iterator iter = thrust::fill_n(v.begin(), v.size(), bool(true));
-    
+
     ASSERT_EQUAL(v[0], 1);
     ASSERT_EQUAL(v[1], 1);
     ASSERT_EQUAL(v[2], 1);
     ASSERT_EQUAL(v[3], 1);
     ASSERT_EQUAL_QUIET(v.end(), iter);
-    
+
     iter = thrust::fill_n(v.begin(), v.size(), char(20));
-    
+
     ASSERT_EQUAL(v[0], 20);
     ASSERT_EQUAL(v[1], 20);
     ASSERT_EQUAL(v[2], 20);
@@ -227,19 +227,19 @@ void TestFillN(size_t n)
     thrust::fill_n(d_data.begin() + begin_offset, std::min((size_t)367, n) - begin_offset, (T) 1);
 
     ASSERT_EQUAL(h_data, d_data);
-    
+
     begin_offset = std::min<size_t>(8, n);
     thrust::fill_n(h_data.begin() + begin_offset, std::min((size_t)259, n) - begin_offset, (T) 2);
     thrust::fill_n(d_data.begin() + begin_offset, std::min((size_t)259, n) - begin_offset, (T) 2);
 
     ASSERT_EQUAL(h_data, d_data);
-    
+
     begin_offset = std::min<size_t>(3, n);
     thrust::fill_n(h_data.begin() + begin_offset, h_data.size() - begin_offset, (T) 3);
     thrust::fill_n(d_data.begin() + begin_offset, d_data.size() - begin_offset, (T) 3);
 
     ASSERT_EQUAL(h_data, d_data);
-    
+
     thrust::fill_n(h_data.begin(), h_data.size(), (T) 4);
     thrust::fill_n(d_data.begin(), d_data.size(), (T) 4);
 
@@ -301,7 +301,7 @@ void TestFillWithTrivialAssignment(void)
 
     thrust::host_vector<T>   h(1);
     thrust::device_vector<T> d(1);
-    
+
     ASSERT_EQUAL(h[0].x, 0);
     ASSERT_EQUAL(h[0].y, 0);
     ASSERT_EQUAL(h[0].z, 0);
@@ -334,6 +334,10 @@ struct TypeWithNonTrivialAssigment
   __host__ __device__
   TypeWithNonTrivialAssigment() : x(0), y(0), z(0) {}
 
+#if THRUST_CPP_DIALECT >= 2011
+  TypeWithNonTrivialAssigment(const TypeWithNonTrivialAssigment &) = default;
+#endif
+
   __host__ __device__
   TypeWithNonTrivialAssigment& operator=(const TypeWithNonTrivialAssigment& t)
   {
@@ -342,7 +346,7 @@ struct TypeWithNonTrivialAssigment
     z = t.x + t.y;
     return *this;
   }
-  
+
   __host__ __device__
   bool operator==(const TypeWithNonTrivialAssigment& t) const
   {
@@ -356,7 +360,7 @@ void TestFillWithNonTrivialAssignment(void)
 
     thrust::host_vector<T>   h(1);
     thrust::device_vector<T> d(1);
-    
+
     ASSERT_EQUAL(h[0].x, 0);
     ASSERT_EQUAL(h[0].y, 0);
     ASSERT_EQUAL(h[0].z, 0);
diff --git a/testing/find.cu b/testing/find.cu
index 7c91320a1..988afbeef 100644
--- a/testing/find.cu
+++ b/testing/find.cu
@@ -1,4 +1,5 @@
 #include <unittest/unittest.h>
+#include <thrust/sequence.h>
 #include <thrust/find.h>
 #include <thrust/iterator/retag.h>
 
@@ -304,3 +305,69 @@ struct TestFindIfNot
 };
 VariableUnitTest<TestFindIfNot, SignedIntegralTypes> TestFindIfNotInstance;
 
+void TestFindWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::detail::intmax_t distance_low_value = thrust::distance(
+        begin,
+        thrust::find(
+            thrust::device,
+            begin,
+            end,
+            17));
+
+    thrust::detail::intmax_t distance_high_value = thrust::distance(
+        begin,
+        thrust::find(
+            thrust::device,
+            begin,
+            end,
+            (1ll << magnitude) - 17));
+
+    ASSERT_EQUAL(distance_low_value, 16);
+    ASSERT_EQUAL(distance_high_value, (1ll << magnitude) - 18);
+}
+
+void TestFindWithBigIndexes()
+{
+    TestFindWithBigIndexesHelper(30);
+    TestFindWithBigIndexesHelper(31);
+    TestFindWithBigIndexesHelper(32);
+    TestFindWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestFindWithBigIndexes);
+
+namespace
+{
+
+class Weird
+{
+  int value;
+
+public:
+  __host__ __device__ Weird(int val, int)
+      : value(val)
+  {}
+
+  friend __host__ __device__
+  bool operator==(int x, Weird y)
+  {
+    return x == y.value;
+  }
+};
+
+} // end anon namespace
+
+void TestFindAsymmetricEquality()
+{ // Regression test for NVIDIA/thrust#1229
+  thrust::host_vector<int> v(1000);
+  thrust::sequence(v.begin(), v.end());
+  thrust::device_vector<int> dv(v);
+  auto result = thrust::find(dv.begin(), dv.end(), Weird(333, 0));
+  ASSERT_EQUAL(*result, 333);
+  ASSERT_EQUAL(result - dv.begin(), 333);
+}
+DECLARE_UNITTEST(TestFindAsymmetricEquality);
diff --git a/testing/fix_clang_nvcc_11.5.h b/testing/fix_clang_nvcc_11.5.h
new file mode 100644
index 000000000..279dca3f9
--- /dev/null
+++ b/testing/fix_clang_nvcc_11.5.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#if defined(__NVCC__) && defined(__clang__) && __CUDACC_VER_MAJOR__ == 11 &&                       \
+    __CUDACC_VER_MINOR__ <= 5
+
+#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
+#  pragma nv_diag_suppress 3171
+#else
+#  pragma diag_suppress 3171
+#endif
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wkeyword-compat"
+
+// Clang has a builtin called `__is_signed`. Unfortunately, libstdc++ headers
+// use this name as an identifier. Clang has a workaround for that, it checks 
+// if `__is_signed` is `const static bool` as in libstdc++ headers and if so,
+// disables the intrinsic for the rest of the TU:
+// https://github.com/llvm/llvm-project/blob/f49b6afc231242dfee027d5da69734836097cd43/clang/lib/Parse/ParseDecl.cpp#L3552-L3566
+const static bool __is_signed = false;
+
+#pragma clang diagnostic pop
+#endif // defined(__NVCC__) && defined(__clang__) && __CUDACC_VER_MAJOR__ == 11 &&
+       //   __CUDACC_VER_MINOR__ <= 5
diff --git a/testing/for_each.cu b/testing/for_each.cu
index 0e9e4ef5c..8040e5f78 100644
--- a/testing/for_each.cu
+++ b/testing/for_each.cu
@@ -355,7 +355,7 @@ DECLARE_UNITTEST(TestForEachNWithLargeTypes);
 
 THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
 
-struct OnlySetWhenExpected
+struct only_set_when_expected
 {
     unsigned long long expected;
     bool * flag;
@@ -379,7 +379,7 @@ void TestForEachWithBigIndexesHelper(int magnitude)
     thrust::device_ptr<bool> has_executed = thrust::device_malloc<bool>(1);
     *has_executed = false;
 
-    OnlySetWhenExpected fn = { (1ull << magnitude) - 1, thrust::raw_pointer_cast(has_executed) };
+    only_set_when_expected fn = { (1ull << magnitude) - 1, thrust::raw_pointer_cast(has_executed) };
 
     thrust::for_each(thrust::device, begin, end, fn);
 
diff --git a/testing/functional.cu b/testing/functional.cu
index 3b758c9b3..1d1a79b6c 100644
--- a/testing/functional.cu
+++ b/testing/functional.cu
@@ -296,6 +296,19 @@ void TestNot1(void)
 }
 DECLARE_INTEGRAL_VECTOR_UNITTEST(TestNot1);
 
+
+// GCC 11 fails to build this test case with a spurious error in a
+// very specific scenario:
+// - GCC 11
+// - CPP system for both host and device
+// - C++11 dialect
+#if !(defined(THRUST_GCC_VERSION) &&				\
+      THRUST_GCC_VERSION >= 110000 &&				\
+      THRUST_GCC_VERSION < 120000 &&				\
+      THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_CPP &&		\
+      THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP &&	\
+      THRUST_CPP_DIALECT == 2011)
+
 template <class Vector>
 void TestNot2(void)
 {
@@ -321,4 +334,6 @@ void TestNot2(void)
 }
 DECLARE_VECTOR_UNITTEST(TestNot2);
 
+#endif // Weird GCC11 failure case
+
 THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
diff --git a/testing/functional_placeholders_arithmetic.cu b/testing/functional_placeholders_arithmetic.cu
index 4376b46a9..8d8535aa6 100644
--- a/testing/functional_placeholders_arithmetic.cu
+++ b/testing/functional_placeholders_arithmetic.cu
@@ -65,8 +65,8 @@ template<typename T>
   struct unary_plus_reference
 {
   __host__ __device__ T operator()(const T &x) const
-  {
-    return +x;
+  { // Static cast to undo integral promotion
+    return static_cast<T>(+x);
   }
 };
 
diff --git a/testing/functional_placeholders_bitwise.cu b/testing/functional_placeholders_bitwise.cu
index bfefb9771..7c92d967f 100644
--- a/testing/functional_placeholders_bitwise.cu
+++ b/testing/functional_placeholders_bitwise.cu
@@ -3,16 +3,18 @@
 #include <thrust/transform.h>
 #include <thrust/iterator/constant_iterator.h>
 
+#include <thrust/detail/allocator/allocator_traits.h>
+
 static const size_t num_samples = 10000;
 
 template<typename Vector, typename U> struct rebind_vector;
 
-// TODO: C++11: use rebind from allocator_traits
 template<typename T, typename U, typename Allocator>
   struct rebind_vector<thrust::host_vector<T, Allocator>, U>
 {
-  typedef thrust::host_vector<U,
-    typename Allocator::template rebind<U>::other> type;
+  typedef typename thrust::detail::allocator_traits<Allocator> alloc_traits;
+  typedef typename alloc_traits::template rebind_alloc<U> new_alloc;
+  typedef thrust::host_vector<U, new_alloc> type;
 };
 
 template<typename T, typename U, typename Allocator>
@@ -22,17 +24,24 @@ template<typename T, typename U, typename Allocator>
     typename Allocator::template rebind<U>::other> type;
 };
 
+template<typename T, typename U, typename Allocator>
+  struct rebind_vector<thrust::universal_vector<T, Allocator>, U>
+{
+  typedef thrust::universal_vector<U,
+    typename Allocator::template rebind<U>::other> type;
+};
+
 #define BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(name, op, reference_functor, type_list) \
 template<typename Vector> \
   struct TestFunctionalPlaceholders##name \
 { \
   void operator()(const size_t) \
   { \
-    static const size_t num_samples = 10000; \
-    const size_t zero = 0; \
+    constexpr size_t NUM_SAMPLES = 10000; \
+    constexpr size_t ZERO = 0; \
     typedef typename Vector::value_type T; \
-    Vector lhs = unittest::random_samples<T>(num_samples); \
-    Vector rhs = unittest::random_samples<T>(num_samples); \
+    Vector lhs = unittest::random_samples<T>(NUM_SAMPLES); \
+    Vector rhs = unittest::random_samples<T>(NUM_SAMPLES); \
     thrust::replace(rhs.begin(), rhs.end(), T(0), T(1)); \
 \
     Vector reference(lhs.size()); \
@@ -47,7 +56,7 @@ template<typename Vector> \
     thrust::transform(lhs.begin(), lhs.end(), result.begin(), _1 op T(1)); \
     ASSERT_ALMOST_EQUAL(reference, result); \
 \
-    thrust::transform(thrust::make_constant_iterator<T>(1,zero), thrust::make_constant_iterator<T>(1,num_samples), rhs.begin(), reference.begin(), reference_functor<T>()); \
+    thrust::transform(thrust::make_constant_iterator<T>(1,ZERO), thrust::make_constant_iterator<T>(1,NUM_SAMPLES), rhs.begin(), reference.begin(), reference_functor<T>()); \
     thrust::transform(rhs.begin(), rhs.end(), result.begin(), T(1) op _1); \
     ASSERT_ALMOST_EQUAL(reference, result); \
   } \
diff --git a/testing/functional_placeholders_logical.cu b/testing/functional_placeholders_logical.cu
index 7fcb640fe..caca82040 100644
--- a/testing/functional_placeholders_logical.cu
+++ b/testing/functional_placeholders_logical.cu
@@ -2,16 +2,18 @@
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
+#include <thrust/detail/allocator/allocator_traits.h>
+
 static const size_t num_samples = 10000;
 
 template<typename Vector, typename U> struct rebind_vector;
 
-// TODO: C++11: use rebind from allocator_traits
 template<typename T, typename U, typename Allocator>
   struct rebind_vector<thrust::host_vector<T, Allocator>, U>
 {
-  typedef thrust::host_vector<U,
-    typename Allocator::template rebind<U>::other> type;
+  typedef typename thrust::detail::allocator_traits<Allocator> alloc_traits;
+  typedef typename alloc_traits::template rebind_alloc<U> new_alloc;
+  typedef thrust::host_vector<U, new_alloc> type;
 };
 
 template<typename T, typename U, typename Allocator>
@@ -21,6 +23,13 @@ template<typename T, typename U, typename Allocator>
     typename Allocator::template rebind<U>::other> type;
 };
 
+template<typename T, typename U, typename Allocator>
+  struct rebind_vector<thrust::universal_vector<T, Allocator>, U>
+{
+  typedef thrust::universal_vector<U,
+    typename Allocator::template rebind<U>::other> type;
+};
+
 #define BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(name, reference_operator, functor) \
 template<typename Vector> \
   void TestFunctionalPlaceholders##name(void) \
diff --git a/testing/functional_placeholders_relational.cu b/testing/functional_placeholders_relational.cu
index 8114ef55e..7f088a1ea 100644
--- a/testing/functional_placeholders_relational.cu
+++ b/testing/functional_placeholders_relational.cu
@@ -2,16 +2,18 @@
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
+#include <thrust/detail/allocator/allocator_traits.h>
+
 static const size_t num_samples = 10000;
 
 template<typename Vector, typename U> struct rebind_vector;
 
-// TODO: C++11: use rebind from allocator_traits
 template<typename T, typename U, typename Allocator>
   struct rebind_vector<thrust::host_vector<T, Allocator>, U>
 {
-  typedef thrust::host_vector<U,
-    typename Allocator::template rebind<U>::other> type;
+    typedef typename thrust::detail::allocator_traits<Allocator> alloc_traits;
+    typedef typename alloc_traits::template rebind_alloc<U> new_alloc;
+    typedef thrust::host_vector<U, new_alloc> type;
 };
 
 template<typename T, typename U, typename Allocator>
@@ -21,6 +23,13 @@ template<typename T, typename U, typename Allocator>
     typename Allocator::template rebind<U>::other> type;
 };
 
+template<typename T, typename U, typename Allocator>
+  struct rebind_vector<thrust::universal_vector<T, Allocator>, U>
+{
+  typedef thrust::universal_vector<U,
+    typename Allocator::template rebind<U>::other> type;
+};
+
 #define BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(name, reference_operator, functor) \
 template<typename Vector> \
   void TestFunctionalPlaceholdersBinary##name(void) \
diff --git a/testing/future.cu b/testing/future.cu
index 0616230c9..eb1ab582a 100644
--- a/testing/future.cu
+++ b/testing/future.cu
@@ -1,6 +1,6 @@
 #include <thrust/detail/config.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <unittest/unittest.h>
 #include <unittest/util_async.h>
@@ -102,8 +102,6 @@ struct test_future_new_stream
   {
     auto f0 = thrust::device_future<T>(thrust::new_stream);
 
-    auto f0_stream = f0.stream().native_handle();
-
     ASSERT_EQUAL(true,  f0.valid_stream());
     ASSERT_EQUAL(false, f0.valid_content());
 
diff --git a/testing/inner_product.cu b/testing/inner_product.cu
index c1f77904b..4fae72e88 100644
--- a/testing/inner_product.cu
+++ b/testing/inner_product.cu
@@ -1,6 +1,11 @@
 #include <unittest/unittest.h>
 #include <thrust/inner_product.h>
+
+#include <thrust/functional.h>
 #include <thrust/iterator/retag.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
+#include <thrust/device_vector.h>
 
 template <class Vector>
 void TestInnerProductSimple(void)
@@ -100,4 +105,69 @@ struct TestInnerProduct
 };
 VariableUnitTest<TestInnerProduct, IntegralTypes> TestInnerProductInstance;
 
+struct only_set_when_both_expected
+{
+    long long expected;
+    bool * flag;
+
+    __device__
+    long long operator()(long long x, long long y)
+    {
+        if (x == expected && y == expected)
+        {
+            *flag = true;
+        }
+
+        return x == y;
+    }
+};
+
+void TestInnerProductWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> has_executed = thrust::device_malloc<bool>(1);
+    *has_executed = false;
+
+    only_set_when_both_expected fn = { (1ll << magnitude) - 1,
+        thrust::raw_pointer_cast(has_executed) };
 
+    ASSERT_EQUAL(thrust::inner_product(
+        thrust::device,
+        begin, end,
+        begin,
+        0ll,
+        thrust::plus<long long>(),
+        fn), (1ll << magnitude));
+
+    bool has_executed_h = *has_executed;
+    thrust::device_free(has_executed);
+
+    ASSERT_EQUAL(has_executed_h, true);
+}
+
+void TestInnerProductWithBigIndexes()
+{
+    TestInnerProductWithBigIndexesHelper(30);
+    TestInnerProductWithBigIndexesHelper(31);
+    TestInnerProductWithBigIndexesHelper(32);
+    TestInnerProductWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestInnerProductWithBigIndexes);
+
+void TestInnerProductPlaceholders()
+{ // Regression test for NVIDIA/thrust#1178
+  using namespace thrust::placeholders;
+
+  thrust::device_vector<float> v1(100, 1.f);
+  thrust::device_vector<float> v2(100, 1.f);
+
+  auto result = thrust::inner_product(v1.begin(), v1.end(), v2.begin(), 0.0f,
+                                      thrust::plus<float>{},
+                                      _1 * _2 + 1.0f);
+
+  ASSERT_ALMOST_EQUAL(result, 200.f);
+}
+DECLARE_UNITTEST(TestInnerProductPlaceholders);
diff --git a/testing/is_contiguous_iterator.cu b/testing/is_contiguous_iterator.cu
index 63a307b7b..42a5aa663 100644
--- a/testing/is_contiguous_iterator.cu
+++ b/testing/is_contiguous_iterator.cu
@@ -134,3 +134,95 @@ void test_is_contiguous_iterator_vectors()
 }
 DECLARE_VECTOR_UNITTEST(test_is_contiguous_iterator_vectors);
 
+
+struct expect_pointer{};
+struct expect_passthrough{};
+
+template <typename IteratorT,
+          typename PointerT,
+          typename expected_unwrapped_type /* = expect_[pointer|passthrough] */>
+struct check_unwrapped_iterator
+{
+  using unwrapped_t = typename std::remove_reference<
+    decltype(thrust::detail::try_unwrap_contiguous_iterator(
+      std::declval<IteratorT>()))>::type;
+
+  static constexpr bool value =
+    std::is_same<expected_unwrapped_type, expect_pointer>::value
+      ? std::is_same<unwrapped_t, PointerT>::value
+      : std::is_same<unwrapped_t, IteratorT>::value;
+};
+
+template <typename T>
+void test_try_unwrap_contiguous_iterator()
+{
+  // Raw pointers should pass whether expecting pointers or passthrough.
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<T *,
+                                                 T *,
+                                                 expect_pointer>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<T *,
+                                                 T *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<T const *,
+                                                 T const *,
+                                                 expect_pointer>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<T const *,
+                                                 T const *,
+                                                 expect_passthrough>::value));
+
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<thrust::device_ptr<T>,
+                                                 T *,
+                                                 expect_pointer>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<thrust::device_ptr<T const>,
+                                                 T const *,
+                                                 expect_pointer>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::vector<T>::iterator,
+                                                 T *,
+                                                 expect_pointer>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::vector<T>::reverse_iterator,
+                                                 T *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::array<T, 1>::iterator,
+                                                 T *,
+                                                 expect_pointer>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::array<T const, 1>::iterator,
+                                                 T const *,
+                                                 expect_pointer>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::list<T>::iterator,
+                                                 T *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::deque<T>::iterator,
+                                                 T *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::set<T>::iterator,
+                                                 T *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::multiset<T>::iterator,
+                                                 T *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::map<T, T>::iterator,
+                                                 std::pair<T const, T> *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::multimap<T, T>::iterator,
+                                                 std::pair<T const, T> *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::unordered_set<T>::iterator,
+                                                 T *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::unordered_multiset<T>::iterator,
+                                                 T *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::unordered_map<T, T>::iterator,
+                                                 std::pair<T const, T> *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::unordered_multimap<T, T>::iterator,
+                                                 std::pair<T const, T> *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<std::istream_iterator<T>,
+                                                 T *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<std::ostream_iterator<T>,
+                                                 void,
+                                                 expect_passthrough>::value));
+}
+DECLARE_GENERIC_UNITTEST(test_try_unwrap_contiguous_iterator);
diff --git a/testing/max_element.cu b/testing/max_element.cu
index e73275c63..456239264 100644
--- a/testing/max_element.cu
+++ b/testing/max_element.cu
@@ -105,3 +105,20 @@ void TestMaxElementDispatchImplicit()
 }
 DECLARE_UNITTEST(TestMaxElementDispatchImplicit);
 
+void TestMaxElementWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    ASSERT_EQUAL(*thrust::max_element(thrust::device, begin, end), (1ll << magnitude));
+}
+
+void TestMaxElementWithBigIndexes()
+{
+    TestMaxElementWithBigIndexesHelper(30);
+    TestMaxElementWithBigIndexesHelper(31);
+    TestMaxElementWithBigIndexesHelper(32);
+    TestMaxElementWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestMaxElementWithBigIndexes);
diff --git a/testing/memory.cu b/testing/memory.cu
index fde4a16be..e4c1da8f6 100644
--- a/testing/memory.cu
+++ b/testing/memory.cu
@@ -46,6 +46,68 @@ class my_memory_system : public thrust::device_execution_policy<my_memory_system
     my_memory_system();
 };
 
+namespace my_old_namespace
+{
+
+struct my_old_temporary_allocation_system
+  : public thrust::device_execution_policy<my_old_temporary_allocation_system>
+{
+};
+
+template <typename T>
+thrust::pair<thrust::pointer<T, my_old_temporary_allocation_system>, std::ptrdiff_t>
+get_temporary_buffer(my_old_temporary_allocation_system, std::ptrdiff_t)
+{
+  thrust::pointer<T, my_old_temporary_allocation_system> const
+    result(reinterpret_cast<T*>(4217));
+
+  return thrust::make_pair(result, 314);
+}
+
+template<typename Pointer>
+void return_temporary_buffer(my_old_temporary_allocation_system, Pointer p)
+{
+  typedef typename thrust::detail::pointer_traits<Pointer>::raw_pointer RP;
+  ASSERT_EQUAL(p.get(), reinterpret_cast<RP>(4217));
+}
+
+} // my_old_namespace
+
+namespace my_new_namespace
+{
+
+struct my_new_temporary_allocation_system
+  : public thrust::device_execution_policy<my_new_temporary_allocation_system>
+{
+};
+
+template <typename T>
+thrust::pair<thrust::pointer<T, my_new_temporary_allocation_system>, std::ptrdiff_t>
+get_temporary_buffer(my_new_temporary_allocation_system, std::ptrdiff_t)
+{
+  thrust::pointer<T, my_new_temporary_allocation_system> const
+    result(reinterpret_cast<T*>(1742));
+
+  return thrust::make_pair(result, 413);
+}
+
+template<typename Pointer>
+void return_temporary_buffer(my_new_temporary_allocation_system, Pointer)
+{
+  // This should never be called (the three-argument with size overload below
+  // should be preferred) and shouldn't be ambiguous.
+  ASSERT_EQUAL(true, false);
+}
+
+template<typename Pointer>
+void return_temporary_buffer(my_new_temporary_allocation_system, Pointer p, std::ptrdiff_t n)
+{
+  typedef typename thrust::detail::pointer_traits<Pointer>::raw_pointer RP;
+  ASSERT_EQUAL(p.get(), reinterpret_cast<RP>(1742));
+  ASSERT_EQUAL(n, 413);
+}
+
+} // my_new_namespace
 
 template<typename T1, typename T2>
 bool are_same(const T1 &, const T2 &)
@@ -119,7 +181,7 @@ void TestGetTemporaryBuffer()
 
   ASSERT_EQUAL(true, thrust::all_of(ptr_and_sz.first, ptr_and_sz.first + n, thrust::placeholders::_1 == ref_val));
 
-  thrust::return_temporary_buffer(dev_tag, ptr_and_sz.first);
+  thrust::return_temporary_buffer(dev_tag, ptr_and_sz.first, ptr_and_sz.second);
 }
 DECLARE_UNITTEST(TestGetTemporaryBuffer);
 
@@ -198,11 +260,6 @@ template<typename T>
 
 void TestGetTemporaryBufferDispatchExplicit()
 {
-#if defined(THRUST_GCC_VERSION) && (THRUST_GCC_VERSION < 40400)
-  // gcc 4.2 does not do adl correctly for get_temporary_buffer
-  // gcc 4.3 does not do adl correctly for malloc
-  KNOWN_FAILURE;
-#else
   const std::ptrdiff_t n = 9001;
 
   my_memory_system sys(0);
@@ -219,8 +276,7 @@ void TestGetTemporaryBufferDispatchExplicit()
 
   ASSERT_EQUAL(true, thrust::all_of(ptr_and_sz.first, ptr_and_sz.first + n, thrust::placeholders::_1 == ref_val));
 
-  thrust::return_temporary_buffer(sys, ptr_and_sz.first);
-#endif
+  thrust::return_temporary_buffer(sys, ptr_and_sz.first, ptr_and_sz.second);
 }
 DECLARE_UNITTEST(TestGetTemporaryBufferDispatchExplicit);
 
@@ -234,11 +290,6 @@ void TestGetTemporaryBufferDispatchImplicit()
   }
   else
   {
-#if defined(THRUST_GCC_VERSION) && (THRUST_GCC_VERSION < 40400)
-    // gcc 4.2 does not do adl correctly for get_temporary_buffer
-    // gcc 4.3 does not do adl correctly for malloc
-    KNOWN_FAILURE;
-#else
     thrust::device_vector<int> vec(9001);
 
     thrust::sequence(vec.begin(), vec.end());
@@ -250,8 +301,48 @@ void TestGetTemporaryBufferDispatchImplicit()
 
     ASSERT_EQUAL(true, thrust::is_sorted(vec.begin(), vec.end()));
     ASSERT_EQUAL(true, sys.is_valid());
-#endif
   }
 }
 DECLARE_UNITTEST(TestGetTemporaryBufferDispatchImplicit);
 
+
+void TestTemporaryBufferOldCustomization()
+{
+  typedef my_old_namespace::my_old_temporary_allocation_system system;
+  typedef thrust::pointer<int, system> pointer;
+  typedef thrust::pair<pointer, std::ptrdiff_t> pointer_and_size;
+
+  system sys;
+
+  {
+    pointer_and_size ps = thrust::get_temporary_buffer<int>(sys, 0);
+
+    // The magic values are defined in `my_old_namespace` above.
+    ASSERT_EQUAL(ps.first.get(), reinterpret_cast<int*>(4217));
+    ASSERT_EQUAL(ps.second, 314);
+
+    thrust::return_temporary_buffer(sys, ps.first, ps.second);
+  }
+}
+DECLARE_UNITTEST(TestTemporaryBufferOldCustomization);
+
+
+void TestTemporaryBufferNewCustomization()
+{
+  typedef my_new_namespace::my_new_temporary_allocation_system system;
+  typedef thrust::pointer<int, system> pointer;
+  typedef thrust::pair<pointer, std::ptrdiff_t> pointer_and_size;
+
+  system sys;
+
+  {
+    pointer_and_size ps = thrust::get_temporary_buffer<int>(sys, 0);
+
+    // The magic values are defined in `my_new_namespace` above.
+    ASSERT_EQUAL(ps.first.get(), reinterpret_cast<int*>(1742));
+    ASSERT_EQUAL(ps.second, 413);
+
+    thrust::return_temporary_buffer(sys, ps.first, ps.second);
+  }
+}
+DECLARE_UNITTEST(TestTemporaryBufferNewCustomization);
diff --git a/testing/min_element.cu b/testing/min_element.cu
index ec9a4a2e1..81fedbdab 100644
--- a/testing/min_element.cu
+++ b/testing/min_element.cu
@@ -103,3 +103,22 @@ void TestMinElementDispatchImplicit()
 }
 DECLARE_UNITTEST(TestMinElementDispatchImplicit);
 
+void TestMinElementWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    ASSERT_EQUAL(
+        *thrust::min_element(thrust::device, begin, end, thrust::greater<long long>()),
+        (1ll << magnitude));
+}
+
+void TestMinElementWithBigIndexes()
+{
+    TestMinElementWithBigIndexesHelper(30);
+    TestMinElementWithBigIndexesHelper(31);
+    TestMinElementWithBigIndexesHelper(32);
+    TestMinElementWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestMinElementWithBigIndexes);
diff --git a/testing/minmax_element.cu b/testing/minmax_element.cu
index 3a91b4ad2..4a87f5bb4 100644
--- a/testing/minmax_element.cu
+++ b/testing/minmax_element.cu
@@ -110,3 +110,29 @@ void TestMinMaxElementDispatchImplicit()
 }
 DECLARE_UNITTEST(TestMinMaxElementDispatchImplicit);
 
+void TestMinMaxElementWithBigIndexesHelper(int magnitude)
+{
+    typedef thrust::counting_iterator<long long> Iter;
+    Iter begin(1);
+    Iter end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::pair<Iter, Iter> result = thrust::minmax_element(
+        thrust::device, begin, end);
+    ASSERT_EQUAL(*result.first, 1);
+    ASSERT_EQUAL(*result.second, (1ll << magnitude));
+
+    result = thrust::minmax_element(thrust::device, begin, end,
+        thrust::greater<long long>());
+    ASSERT_EQUAL(*result.second, 1);
+    ASSERT_EQUAL(*result.first, (1ll << magnitude));
+}
+
+void TestMinMaxElementWithBigIndexes()
+{
+    TestMinMaxElementWithBigIndexesHelper(30);
+    TestMinMaxElementWithBigIndexesHelper(31);
+    TestMinMaxElementWithBigIndexesHelper(32);
+    TestMinMaxElementWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestMinMaxElementWithBigIndexes);
diff --git a/testing/mr_disjoint_pool.cu b/testing/mr_disjoint_pool.cu
index 883250671..69a6005ec 100644
--- a/testing/mr_disjoint_pool.cu
+++ b/testing/mr_disjoint_pool.cu
@@ -1,8 +1,10 @@
 #include <unittest/unittest.h>
+
+#include <thrust/detail/config.h>
 #include <thrust/mr/disjoint_pool.h>
 #include <thrust/mr/new.h>
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
 #include <thrust/mr/disjoint_sync_pool.h>
 #endif
 
@@ -19,18 +21,19 @@ struct alloc_id
         return id == other.id && size == other.size && alignment == other.alignment;
     }
 
-    alloc_id operator+(std::size_t size) const
+    alloc_id operator+(std::size_t size_) const
     {
         alloc_id ret;
         ret.id = id;
-        ret.size = size;
+        ret.size = size_;
         ret.alignment = alignment;
-        ret.offset = size;
+        ret.offset = size_;
         return ret;
     }
 };
 
-namespace thrust { namespace detail {
+THRUST_NAMESPACE_BEGIN
+namespace detail {
 template<>
 struct pointer_traits<alloc_id>
 {
@@ -46,9 +49,12 @@ struct pointer_traits<alloc_id>
         return reinterpret_cast<void *>(id.alignment);
     }
 };
-}}
 
-class dummy_resource THRUST_FINAL : public thrust::mr::memory_resource<alloc_id>
+} // end namespace detail
+
+THRUST_NAMESPACE_END
+
+class dummy_resource final : public thrust::mr::memory_resource<alloc_id>
 {
 public:
     dummy_resource() : id_to_allocate(0), id_to_deallocate(0)
@@ -61,7 +67,7 @@ public:
         ASSERT_EQUAL(id_to_deallocate, 0u);
     }
 
-    virtual alloc_id do_allocate(std::size_t bytes, std::size_t alignment) THRUST_OVERRIDE
+    virtual alloc_id do_allocate(std::size_t bytes, std::size_t alignment) override
     {
         ASSERT_EQUAL(static_cast<bool>(id_to_allocate), true);
 
@@ -75,7 +81,7 @@ public:
         return ret;
     }
 
-    virtual void do_deallocate(alloc_id p, std::size_t bytes, std::size_t alignment) THRUST_OVERRIDE
+    virtual void do_deallocate(alloc_id p, std::size_t bytes, std::size_t alignment) override
     {
         ASSERT_EQUAL(p.size, bytes);
         ASSERT_EQUAL(p.alignment, alignment);
@@ -177,7 +183,7 @@ void TestDisjointUnsynchronizedPool()
 }
 DECLARE_UNITTEST(TestDisjointUnsynchronizedPool);
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
 void TestDisjointSynchronizedPool()
 {
     TestDisjointPool<thrust::mr::disjoint_synchronized_pool_resource>();
@@ -260,7 +266,7 @@ void TestDisjointUnsynchronizedPoolCachingOversized()
 }
 DECLARE_UNITTEST(TestDisjointUnsynchronizedPoolCachingOversized);
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
 void TestDisjointSynchronizedPoolCachingOversized()
 {
     TestDisjointPoolCachingOversized<thrust::mr::disjoint_synchronized_pool_resource>();
@@ -285,7 +291,7 @@ void TestUnsynchronizedDisjointGlobalPool()
 }
 DECLARE_UNITTEST(TestUnsynchronizedDisjointGlobalPool);
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
 void TestSynchronizedDisjointGlobalPool()
 {
     TestDisjointGlobalPool<thrust::mr::disjoint_synchronized_pool_resource>();
diff --git a/testing/mr_new.cu b/testing/mr_new.cu
index df0f3fde5..02f34eccf 100644
--- a/testing/mr_new.cu
+++ b/testing/mr_new.cu
@@ -9,7 +9,7 @@ void TestAlignment(MemoryResource memres, std::size_t size, std::size_t alignmen
     ASSERT_EQUAL(reinterpret_cast<std::size_t>(ptr) % alignment, 0u);
 
     char * char_ptr = reinterpret_cast<char *>(ptr);
-    thrust::fill(char_ptr, char_ptr + size, 0);
+    thrust::fill(char_ptr, char_ptr + size, char{});
 
     memres.do_deallocate(ptr, size, alignment);
 }
diff --git a/testing/mr_pool.cu b/testing/mr_pool.cu
index bd91c04ea..30c1f18a4 100644
--- a/testing/mr_pool.cu
+++ b/testing/mr_pool.cu
@@ -1,8 +1,10 @@
 #include <unittest/unittest.h>
+
+#include <thrust/detail/config.h>
 #include <thrust/mr/pool.h>
 #include <thrust/mr/new.h>
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
 #include <thrust/mr/sync_pool.h>
 #endif
 
@@ -106,7 +108,7 @@ struct tracked_pointer : thrust::iterator_facade<
     }
 };
 
-class tracked_resource THRUST_FINAL : public thrust::mr::memory_resource<tracked_pointer<void> >
+class tracked_resource final : public thrust::mr::memory_resource<tracked_pointer<void> >
 {
 public:
     tracked_resource() : id_to_allocate(0), id_to_deallocate(0)
@@ -119,7 +121,7 @@ public:
         ASSERT_EQUAL(id_to_deallocate, 0u);
     }
 
-    virtual tracked_pointer<void> do_allocate(std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    virtual tracked_pointer<void> do_allocate(std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
     {
         ASSERT_EQUAL(static_cast<bool>(id_to_allocate), true);
 
@@ -134,7 +136,7 @@ public:
         return ret;
     }
 
-    virtual void do_deallocate(tracked_pointer<void> p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    virtual void do_deallocate(tracked_pointer<void> p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
     {
         ASSERT_EQUAL(p.size, n);
         ASSERT_EQUAL(p.alignment, alignment);
@@ -241,7 +243,7 @@ void TestUnsynchronizedPool()
 }
 DECLARE_UNITTEST(TestUnsynchronizedPool);
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
 void TestSynchronizedPool()
 {
     TestPool<thrust::mr::synchronized_pool_resource>();
@@ -324,7 +326,7 @@ void TestUnsynchronizedPoolCachingOversized()
 }
 DECLARE_UNITTEST(TestUnsynchronizedPoolCachingOversized);
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
 void TestSynchronizedPoolCachingOversized()
 {
     TestPoolCachingOversized<thrust::mr::synchronized_pool_resource>();
@@ -348,7 +350,7 @@ void TestUnsynchronizedGlobalPool()
 }
 DECLARE_UNITTEST(TestUnsynchronizedGlobalPool);
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
 void TestSynchronizedGlobalPool()
 {
     TestGlobalPool<thrust::mr::synchronized_pool_resource>();
diff --git a/testing/namespace_wrapped.cu b/testing/namespace_wrapped.cu
new file mode 100644
index 000000000..b6bcb3dbb
--- /dev/null
+++ b/testing/namespace_wrapped.cu
@@ -0,0 +1,43 @@
+// Wrap thrust and cub in different enclosing namespaces
+// (In practice, you probably want these to be the same, in which case just
+// set THRUST_CUB_WRAPPED_NAMESPACE to set both).
+#define THRUST_WRAPPED_NAMESPACE wrap_thrust
+#define CUB_WRAPPED_NAMESPACE    wrap_cub
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/transform.h>
+
+#include <unittest/unittest.h>
+
+// Test that we can use a few common utilities and algorithms from a wrapped
+// namespace at runtime. More extensive testing is performed by the header
+// tests and the check_namespace.cmake test.
+void TestWrappedNamespace()
+{
+  const std::size_t n = 2048;
+
+  const auto in_1_begin =
+    ::wrap_thrust::thrust::make_constant_iterator<int>(12);
+  const auto in_2_begin =
+    ::wrap_thrust::thrust::make_counting_iterator<int>(1024);
+
+  // Check that the qualifier resolves properly:
+  THRUST_NS_QUALIFIER::device_vector<int> d_out(n);
+
+  ::wrap_thrust::thrust::transform(in_1_begin,
+                                   in_1_begin + n,
+                                   in_2_begin,
+                                   d_out.begin(),
+                                   ::wrap_thrust::thrust::plus<>{});
+
+  ::wrap_thrust::thrust::host_vector<int> h_out(d_out);
+
+  for (std::size_t i = 0; i < n; ++i)
+  {
+    ASSERT_EQUAL(h_out[i], static_cast<int>(i) + 1024 + 12);
+  }
+}
+DECLARE_UNITTEST(TestWrappedNamespace);
diff --git a/testing/omp/CMakeLists.txt b/testing/omp/CMakeLists.txt
new file mode 100644
index 000000000..89ea9bb0c
--- /dev/null
+++ b/testing/omp/CMakeLists.txt
@@ -0,0 +1,18 @@
+file(GLOB test_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}}"
+  CONFIGURE_DEPENDS
+  *.cu *.cpp
+)
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  if (NOT config_device STREQUAL "OMP")
+    continue()
+  endif()
+
+  foreach(test_src IN LISTS test_srcs)
+    get_filename_component(test_name "${test_src}" NAME_WLE)
+    string(PREPEND test_name "omp.")
+    thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target})
+  endforeach()
+endforeach()
diff --git a/testing/out_of_memory_recovery.cu b/testing/out_of_memory_recovery.cu
new file mode 100644
index 000000000..5e4f0c327
--- /dev/null
+++ b/testing/out_of_memory_recovery.cu
@@ -0,0 +1,33 @@
+// Regression test for NVBug 2720132.
+//
+// Summary of 2720132:
+//
+// 1. The large allocation fails due to running out of memory.
+// 2. A `thrust::system::system_error` exception is thrown.
+// 3. Local objects are destroyed as the stack is unwound, leading to the destruction of `x`.
+// 4. `x` runs a parallel algorithm in its destructor to call the destructors of all of its elements.
+// 5. Launching that parallel algorithm fails because of the prior CUDA out of memory error.
+// 6. A `thrust::system::system_error` exception is thrown.
+// 7. Because we've already got an active exception, `terminate` is called.
+
+#include <unittest/unittest.h>
+#include <thrust/device_vector.h>
+#include <thrust/detail/cstdint.h>
+
+struct non_trivial
+{
+  __host__ __device__ non_trivial() {}
+  __host__ __device__ ~non_trivial() {}
+};
+
+void test_out_of_memory_recovery()
+{
+  try
+  {
+    thrust::device_vector<non_trivial> x(1);
+
+    thrust::device_vector<thrust::detail::uint32_t> y(0x00ffffffffffffff);
+  }
+  catch (...) { }
+}
+DECLARE_UNITTEST(test_out_of_memory_recovery);
diff --git a/testing/pair.cu b/testing/pair.cu
index a213265f3..f5f6e92b5 100644
--- a/testing/pair.cu
+++ b/testing/pair.cu
@@ -213,22 +213,42 @@ struct TestPairGet
 };
 SimpleUnitTest<TestPairGet, BuiltinNumericTypes> TestPairGetInstance;
 
+using PairConstVolatileTypes =
+    unittest::type_list<thrust::pair<int, float>, thrust::pair<int, float> const,
+                        thrust::pair<int, float> const volatile>;
 
-void TestPairTupleSize(void)
+template <typename Pair> 
+struct TestPairTupleSize
 {
-  int result = thrust::tuple_size< thrust::pair<int,int> >::value;
-  ASSERT_EQUAL(2, result);
+  void operator()()
+  {
+    ASSERT_EQUAL(2, static_cast<int>(thrust::tuple_size<Pair>::value));
+  }
 };
-DECLARE_UNITTEST(TestPairTupleSize);
+SimpleUnitTest<TestPairTupleSize, PairConstVolatileTypes> TestPairTupleSizeInstance;
 
 
 void TestPairTupleElement(void)
 {
-  typedef thrust::tuple_element<0, thrust::pair<int, float> >::type type0;
-  typedef thrust::tuple_element<1, thrust::pair<int, float> >::type type1;
-
-  ASSERT_EQUAL_QUIET(typeid(int),   typeid(type0));
-  ASSERT_EQUAL_QUIET(typeid(float), typeid(type1));
+  using type0 = thrust::tuple_element<0, thrust::pair<int, float> >::type;
+  using type1 = thrust::tuple_element<1, thrust::pair<int, float> >::type;
+  static_assert(std::is_same<int, type0>::value,"");
+  static_assert(std::is_same<float, type1>::value,"");
+
+  using c_type0 = thrust::tuple_element<0, thrust::pair<int, float> const>::type;
+  using c_type1 = thrust::tuple_element<1, thrust::pair<int, float> const>::type;
+  static_assert(std::is_same<int const, c_type0>::value,"");
+  static_assert(std::is_same<float const, c_type1>::value,"");
+
+  using v_type0 = thrust::tuple_element<0, thrust::pair<int, float> volatile>::type;
+  using v_type1 = thrust::tuple_element<1, thrust::pair<int, float> volatile>::type;
+  static_assert(std::is_same<int volatile, v_type0>::value,"");
+  static_assert(std::is_same<float volatile, v_type1>::value,"");
+
+  using cv_type0 = thrust::tuple_element<0, thrust::pair<int, float> const volatile>::type;
+  using cv_type1 = thrust::tuple_element<1, thrust::pair<int, float> const volatile>::type;
+  static_assert(std::is_same<int const volatile, cv_type0>::value,"");
+  static_assert(std::is_same<float const volatile, cv_type1>::value,"");
 };
 DECLARE_UNITTEST(TestPairTupleElement);
 
diff --git a/testing/pair_reduce.cu b/testing/pair_reduce.cu
index ebdab6597..6682fb3cc 100644
--- a/testing/pair_reduce.cu
+++ b/testing/pair_reduce.cu
@@ -20,7 +20,11 @@ struct add_pairs
   __host__ __device__
     Pair1 operator()(const Pair1 &x, const Pair2 &y)
   {
-    return thrust::make_pair(x.first + y.first, x.second + y.second);
+    // Need cast to undo integer promotion, decltype(char{} + char{}) == int
+    using P1T1 = typename Pair1::first_type;
+    using P1T2 = typename Pair1::second_type;
+    return thrust::make_pair(static_cast<P1T1>(x.first + y.first),
+                             static_cast<P1T2>(x.second + y.second));
   } // end operator()
 }; // end add_pairs
 
@@ -43,7 +47,7 @@ template <typename T>
     thrust::device_vector<T> d_p2 = h_p2;
     thrust::device_vector<P> d_pairs = h_pairs;
 
-    P init = thrust::make_pair(13,13);
+    P init = thrust::make_pair(T{13}, T{13});
 
     // reduce on the host
     P h_result = thrust::reduce(h_pairs.begin(), h_pairs.end(), init, add_pairs());
diff --git a/testing/pair_scan.cu b/testing/pair_scan.cu
index b1bfe064b..5554c6dc4 100644
--- a/testing/pair_scan.cu
+++ b/testing/pair_scan.cu
@@ -61,19 +61,6 @@ template <typename T>
     thrust::inclusive_scan(d_pairs.begin(), d_pairs.end(), d_output.begin(), thrust::maximum<P>());
     ASSERT_EQUAL_QUIET(h_output, d_output);
 
-
-    // The tests below get miscompiled on Tesla hw for 8b types
-
-#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-    if(const CUDATestDriver *driver = dynamic_cast<const CUDATestDriver*>(&UnitTestDriver::s_driver()))
-    {
-      if(sizeof(T) == sizeof(unittest::uint8_t) && driver->current_device_architecture() < 200)
-      {
-        KNOWN_FAILURE;
-      } // end if
-    } // end if
-#endif
-
     // scan with plus
     thrust::exclusive_scan(h_pairs.begin(), h_pairs.end(), h_output.begin(), init, add_pairs());
     thrust::exclusive_scan(d_pairs.begin(), d_pairs.end(), d_output.begin(), init, add_pairs());
diff --git a/testing/pair_scan_by_key.cu b/testing/pair_scan_by_key.cu
index 6e63bc806..21b53bcbe 100644
--- a/testing/pair_scan_by_key.cu
+++ b/testing/pair_scan_by_key.cu
@@ -20,7 +20,11 @@ struct add_pairs
   __host__ __device__
     Pair1 operator()(const Pair1 &x, const Pair2 &y)
   {
-    return thrust::make_pair(x.first + y.first, x.second + y.second);
+    // Need cast to undo integer promotion, decltype(char{} + char{}) == int
+    using P1T1 = typename Pair1::first_type;
+    using P1T2 = typename Pair1::second_type;
+    return thrust::make_pair(static_cast<P1T1>(x.first + y.first),
+                             static_cast<P1T2>(x.second + y.second));
   } // end operator()
 }; // end add_pairs
 
@@ -46,7 +50,7 @@ template <typename T>
     thrust::host_vector<T>   h_keys = unittest::random_integers<bool>(n);
     thrust::device_vector<T> d_keys = h_keys;
 
-    P init = thrust::make_pair(13,13);
+    P init = thrust::make_pair(T{13}, T{13});
 
     // scan on the host
     thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_pairs.begin(), h_pairs.begin(), init, thrust::equal_to<T>(), add_pairs());
diff --git a/testing/partition.cu b/testing/partition.cu
index 742560f59..31aaa9fdd 100644
--- a/testing/partition.cu
+++ b/testing/partition.cu
@@ -6,6 +6,12 @@
 #include <thrust/iterator/retag.h>
 #include <thrust/sort.h>
 
+#if defined(THRUST_GCC_VERSION) && \
+  THRUST_GCC_VERSION >= 110000 && \
+  THRUST_GCC_VERSION < 120000
+#define WAIVE_GCC11_FAILURES
+#endif
+
 template<typename T>
 struct is_even
 {
@@ -21,6 +27,17 @@ void TestPartitionSimple(void)
     typedef typename Vector::value_type T;
     typedef typename Vector::iterator   Iterator;
 
+    // GCC 11 miscompiles and segfaults for certain versions of this test.
+    // It's not reproducible on other compilers, and the test passes when
+    // optimizations are disabled. It only affects 32-bit value types, and
+    // impacts all CPU host/device combinations tested.
+#ifdef WAIVE_GCC11_FAILURES
+    if (sizeof(T) == 4)
+    {
+      return;
+    }
+#endif
+
     Vector data(5);
     data[0] = 1; 
     data[1] = 2; 
@@ -321,6 +338,17 @@ struct TestPartitionStencil
 {
     void operator()(const size_t n)
     {
+        // GCC 11 miscompiles and segfaults for certain versions of this test.
+        // It's not reproducible on other compilers, and the test passes when
+        // optimizations are disabled. It only affects 32-bit value types, and
+        // impacts all CPU host/device combinations tested.
+#ifdef WAIVE_GCC11_FAILURES
+        if (n == 0 && sizeof(T) == 4)
+        {
+          return;
+        }
+#endif
+
         // setup ranges
         thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
         thrust::host_vector<T>   h_stencil = unittest::random_integers<T>(n);
@@ -684,6 +712,9 @@ struct TestPartitionCopyStencilToDiscardIterator
 VariableUnitTest<TestPartitionCopyStencilToDiscardIterator, PartitionTypes> TestPartitionCopyStencilToDiscardIteratorInstance;
 
 
+// GCC 11 miscompiles and segfaults in this tests.
+#ifndef WAIVE_GCC11_FAILURES
+
 template <typename T>
 struct TestStablePartition
 {
@@ -702,6 +733,11 @@ struct TestStablePartition
 };
 VariableUnitTest<TestStablePartition, PartitionTypes> TestStablePartitionInstance;
 
+#endif // WAIVE_GCC11_FAILURES
+
+
+// GCC 11 miscompiles and segfaults in this tests.
+#ifndef WAIVE_GCC11_FAILURES
 
 template <typename T>
 struct TestStablePartitionStencil
@@ -723,6 +759,8 @@ struct TestStablePartitionStencil
 };
 VariableUnitTest<TestStablePartitionStencil, PartitionTypes> TestStablePartitionStencilInstance;
 
+#endif // WAIVE_GCC11_FAILURES
+
 
 template <typename T>
 struct TestStablePartitionCopy
diff --git a/testing/partition_point.cu b/testing/partition_point.cu
index d93aeac27..bd5a6a8c8 100644
--- a/testing/partition_point.cu
+++ b/testing/partition_point.cu
@@ -95,3 +95,39 @@ void TestPartitionPointDispatchImplicit()
 }
 DECLARE_UNITTEST(TestPartitionPointDispatchImplicit);
 
+struct test_less_than
+{
+    long long expected;
+
+    __device__
+    bool operator()(long long y)
+    {
+        return y < expected;
+    }
+};
+
+void TestPartitionPointWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(0);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    test_less_than fn = { (1ll << magnitude) - 17 };
+
+    ASSERT_EQUAL(thrust::distance(
+        begin,
+        thrust::partition_point(
+            thrust::device,
+            begin, end,
+            fn)),
+        (1ll << magnitude) - 17);
+}
+
+void TestPartitionPointWithBigIndexes()
+{
+    TestPartitionPointWithBigIndexesHelper(30);
+    TestPartitionPointWithBigIndexesHelper(31);
+    TestPartitionPointWithBigIndexesHelper(32);
+    TestPartitionPointWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestPartitionPointWithBigIndexes);
diff --git a/testing/permutation_iterator.cu b/testing/permutation_iterator.cu
index 94f5857c4..22fef650c 100644
--- a/testing/permutation_iterator.cu
+++ b/testing/permutation_iterator.cu
@@ -279,17 +279,20 @@ DECLARE_UNITTEST(TestPermutationIteratorHostDeviceScatter);
 template <typename Vector>
 void TestPermutationIteratorWithCountingIterator(void)
 {
-  typedef typename Vector::value_type T;
+  using T = typename Vector::value_type;
+  using diff_t = typename thrust::counting_iterator<T>::difference_type;
   
-  typename thrust::counting_iterator<T> input(0), index(0);
+  thrust::counting_iterator<T> input(0), index(0);
 
   // test copy()
   {
     Vector output(4,0);
 
-    thrust::copy(thrust::make_permutation_iterator(input, index),
-                 thrust::make_permutation_iterator(input, index + output.size()),
-                 output.begin());
+    auto first = thrust::make_permutation_iterator(input, index);
+    auto last  = thrust::make_permutation_iterator(input,
+                                                   index + static_cast<diff_t>(output.size()));
+
+    thrust::copy(first, last, output.begin());
 
     ASSERT_EQUAL(output[0], 0);
     ASSERT_EQUAL(output[1], 1);
diff --git a/testing/reduce.cu b/testing/reduce.cu
index d9daeee03..cb08bc889 100644
--- a/testing/reduce.cu
+++ b/testing/reduce.cu
@@ -1,6 +1,7 @@
 #include <unittest/unittest.h>
 #include <thrust/reduce.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/retag.h>
 #include <limits>
 
@@ -210,3 +211,22 @@ template<typename T>
 }
 DECLARE_GENERIC_UNITTEST(TestReduceCountingIterator);
 
+void TestReduceWithBigIndexesHelper(int magnitude)
+{
+    thrust::constant_iterator<long long> begin(1);
+    thrust::constant_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    long long result = thrust::reduce(thrust::device, begin, end);
+
+    ASSERT_EQUAL(result, 1ll << magnitude);
+}
+
+void TestReduceWithBigIndexes()
+{
+    TestReduceWithBigIndexesHelper(30);
+    TestReduceWithBigIndexesHelper(31);
+    TestReduceWithBigIndexesHelper(32);
+    TestReduceWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestReduceWithBigIndexes);
diff --git a/testing/reduce_large.cu b/testing/reduce_large.cu
index cfe2d0973..170895ccc 100644
--- a/testing/reduce_large.cu
+++ b/testing/reduce_large.cu
@@ -10,12 +10,14 @@ void _TestReduceWithLargeTypes(void)
     thrust::host_vector< FixedVector<T,N> > h_data(n);
 
     for(size_t i = 0; i < h_data.size(); i++)
-        h_data[i] = FixedVector<T,N>(i);
+    {
+      h_data[i] = FixedVector<T, N>(static_cast<T>(i));
+    }
 
     thrust::device_vector< FixedVector<T,N> > d_data = h_data;
     
-    FixedVector<T,N> h_result = thrust::reduce(h_data.begin(), h_data.end(), FixedVector<T,N>(0));
-    FixedVector<T,N> d_result = thrust::reduce(d_data.begin(), d_data.end(), FixedVector<T,N>(0));
+    FixedVector<T,N> h_result = thrust::reduce(h_data.begin(), h_data.end(), FixedVector<T,N>(T{0}));
+    FixedVector<T,N> d_result = thrust::reduce(d_data.begin(), d_data.end(), FixedVector<T,N>(T{0}));
 
     ASSERT_EQUAL_QUIET(h_result, d_result);
 }
diff --git a/testing/regression/CMakeLists.txt b/testing/regression/CMakeLists.txt
new file mode 100644
index 000000000..eea8b3a45
--- /dev/null
+++ b/testing/regression/CMakeLists.txt
@@ -0,0 +1,20 @@
+#
+# Disabled as these test names are too long for CMAKE_OBJECT_PATH_MAX.
+# We should integrate these with the other unit tests.
+# See issue #1205.
+#
+return()
+
+file(GLOB test_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}}"
+  CONFIGURE_DEPENDS
+  *.cu *.cpp
+)
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  foreach(test_src IN LISTS test_srcs)
+    get_filename_component(test_name "${test_src}" NAME_WLE)
+    string(PREPEND test_name "regression.")
+    thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target})
+  endforeach()
+endforeach()
diff --git a/testing/remove.cu b/testing/remove.cu
index 39adec1af..95b679dc7 100644
--- a/testing/remove.cu
+++ b/testing/remove.cu
@@ -30,14 +30,14 @@ void TestRemoveSimple(void)
     typedef typename Vector::value_type T;
 
     Vector data(5);
-    data[0] =  1; 
-    data[1] =  2; 
+    data[0] =  1;
+    data[1] =  2;
     data[2] =  1;
-    data[3] =  3; 
-    data[4] =  2; 
+    data[3] =  3;
+    data[4] =  2;
 
-    typename Vector::iterator end = thrust::remove(data.begin(), 
-                                                    data.end(), 
+    typename Vector::iterator end = thrust::remove(data.begin(),
+                                                    data.end(),
                                                     (T) 2);
 
     ASSERT_EQUAL(end - data.begin(), 3);
@@ -102,17 +102,17 @@ void TestRemoveCopySimple(void)
     typedef typename Vector::value_type T;
 
     Vector data(5);
-    data[0] =  1; 
-    data[1] =  2; 
+    data[0] =  1;
+    data[1] =  2;
     data[2] =  1;
-    data[3] =  3; 
-    data[4] =  2; 
+    data[3] =  3;
+    data[4] =  2;
 
     Vector result(5);
 
-    typename Vector::iterator end = thrust::remove_copy(data.begin(), 
-                                                        data.end(), 
-                                                        result.begin(), 
+    typename Vector::iterator end = thrust::remove_copy(data.begin(),
+                                                        data.end(),
+                                                        result.begin(),
                                                         (T) 2);
 
     ASSERT_EQUAL(end - result.begin(), 3);
@@ -186,14 +186,14 @@ void TestRemoveIfSimple(void)
     typedef typename Vector::value_type T;
 
     Vector data(5);
-    data[0] =  1; 
-    data[1] =  2; 
+    data[0] =  1;
+    data[1] =  2;
     data[2] =  1;
-    data[3] =  3; 
-    data[4] =  2; 
+    data[3] =  3;
+    data[4] =  2;
 
-    typename Vector::iterator end = thrust::remove_if(data.begin(), 
-                                                      data.end(), 
+    typename Vector::iterator end = thrust::remove_if(data.begin(),
+                                                      data.end(),
                                                       is_even<T>());
 
     ASSERT_EQUAL(end - data.begin(), 3);
@@ -258,11 +258,11 @@ void TestRemoveIfStencilSimple(void)
     typedef typename Vector::value_type T;
 
     Vector data(5);
-    data[0] =  1; 
-    data[1] =  2; 
+    data[0] =  1;
+    data[1] =  2;
     data[2] =  1;
-    data[3] =  3; 
-    data[4] =  2; 
+    data[3] =  3;
+    data[4] =  2;
 
     Vector stencil(5);
     stencil[0] = 0;
@@ -271,7 +271,7 @@ void TestRemoveIfStencilSimple(void)
     stencil[3] = 0;
     stencil[4] = 1;
 
-    typename Vector::iterator end = thrust::remove_if(data.begin(), 
+    typename Vector::iterator end = thrust::remove_if(data.begin(),
                                                       data.end(),
                                                       stencil.begin(),
                                                       thrust::identity<T>());
@@ -347,17 +347,17 @@ void TestRemoveCopyIfSimple(void)
     typedef typename Vector::value_type T;
 
     Vector data(5);
-    data[0] =  1; 
-    data[1] =  2; 
+    data[0] =  1;
+    data[1] =  2;
     data[2] =  1;
-    data[3] =  3; 
-    data[4] =  2; 
+    data[3] =  3;
+    data[4] =  2;
 
     Vector result(5);
 
-    typename Vector::iterator end = thrust::remove_copy_if(data.begin(), 
-                                                           data.end(), 
-                                                           result.begin(), 
+    typename Vector::iterator end = thrust::remove_copy_if(data.begin(),
+                                                           data.end(),
+                                                           result.begin(),
                                                            is_even<T>());
 
     ASSERT_EQUAL(end - result.begin(), 3);
@@ -431,11 +431,11 @@ void TestRemoveCopyIfStencilSimple(void)
     typedef typename Vector::value_type T;
 
     Vector data(5);
-    data[0] =  1; 
-    data[1] =  2; 
+    data[0] =  1;
+    data[1] =  2;
     data[2] =  1;
-    data[3] =  3; 
-    data[4] =  2; 
+    data[3] =  3;
+    data[4] =  2;
 
     Vector stencil(5);
     stencil[0] = 0;
@@ -446,10 +446,10 @@ void TestRemoveCopyIfStencilSimple(void)
 
     Vector result(5);
 
-    typename Vector::iterator end = thrust::remove_copy_if(data.begin(), 
-                                                           data.end(), 
+    typename Vector::iterator end = thrust::remove_copy_if(data.begin(),
+                                                           data.end(),
                                                            stencil.begin(),
-                                                           result.begin(), 
+                                                           result.begin(),
                                                            thrust::identity<T>());
 
     ASSERT_EQUAL(end - result.begin(), 3);
@@ -531,7 +531,7 @@ void TestRemove(const size_t n)
 
     size_t h_size = thrust::remove(h_data.begin(), h_data.end(), T(0)) - h_data.begin();
     size_t d_size = thrust::remove(d_data.begin(), d_data.end(), T(0)) - d_data.begin();
-    
+
     ASSERT_EQUAL(h_size, d_size);
 
     h_data.resize(h_size);
@@ -550,7 +550,7 @@ void TestRemoveIf(const size_t n)
 
     size_t h_size = thrust::remove_if(h_data.begin(), h_data.end(), is_true<T>()) - h_data.begin();
     size_t d_size = thrust::remove_if(d_data.begin(), d_data.end(), is_true<T>()) - d_data.begin();
-   
+
     ASSERT_EQUAL(h_size, d_size);
 
     h_data.resize(h_size);
@@ -569,10 +569,10 @@ void TestRemoveIfStencil(const size_t n)
 
     thrust::host_vector<bool>   h_stencil = unittest::random_integers<bool>(n);
     thrust::device_vector<bool> d_stencil = h_stencil;
-    
+
     size_t h_size = thrust::remove_if(h_data.begin(), h_data.end(), h_stencil.begin(), is_true<T>()) - h_data.begin();
     size_t d_size = thrust::remove_if(d_data.begin(), d_data.end(), d_stencil.begin(), is_true<T>()) - d_data.begin();
-   
+
     ASSERT_EQUAL(h_size, d_size);
 
     h_data.resize(h_size);
@@ -588,13 +588,13 @@ void TestRemoveCopy(const size_t n)
 {
     thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
     thrust::device_vector<T> d_data = h_data;
-    
+
     thrust::host_vector<T>   h_result(n);
     thrust::device_vector<T> d_result(n);
 
     size_t h_size = thrust::remove_copy(h_data.begin(), h_data.end(), h_result.begin(), T(0)) - h_result.begin();
     size_t d_size = thrust::remove_copy(d_data.begin(), d_data.end(), d_result.begin(), T(0)) - d_result.begin();
-    
+
     ASSERT_EQUAL(h_size, d_size);
 
     h_result.resize(h_size);
@@ -621,7 +621,7 @@ void TestRemoveCopyToDiscardIterator(const size_t n)
       thrust::remove_copy(d_data.begin(), d_data.end(), thrust::make_discard_iterator(), T(0));
 
     thrust::discard_iterator<> reference(num_nonzeros);
-    
+
     ASSERT_EQUAL_QUIET(reference, h_result);
     ASSERT_EQUAL_QUIET(reference, d_result);
 }
@@ -659,7 +659,7 @@ void TestRemoveCopyToDiscardIteratorZipped(const size_t n)
                           thrust::make_tuple(T(0),T(0)));
 
     thrust::discard_iterator<> reference(num_nonzeros);
-    
+
     ASSERT_EQUAL(h_output, d_output);
     ASSERT_EQUAL_QUIET(reference, thrust::get<1>(h_result.get_iterator_tuple()));
     ASSERT_EQUAL_QUIET(reference, thrust::get<1>(d_result.get_iterator_tuple()));
@@ -675,10 +675,10 @@ void TestRemoveCopyIf(const size_t n)
 
     thrust::host_vector<T>   h_result(n);
     thrust::device_vector<T> d_result(n);
-    
+
     size_t h_size = thrust::remove_copy_if(h_data.begin(), h_data.end(), h_result.begin(), is_true<T>()) - h_result.begin();
     size_t d_size = thrust::remove_copy_if(d_data.begin(), d_data.end(), d_result.begin(), is_true<T>()) - d_result.begin();
-    
+
     ASSERT_EQUAL(h_size, d_size);
 
     h_result.resize(h_size);
@@ -716,16 +716,16 @@ void TestRemoveCopyIfStencil(const size_t n)
 {
     thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
     thrust::device_vector<T> d_data = h_data;
-    
+
     thrust::host_vector<bool>   h_stencil = unittest::random_integers<bool>(n);
     thrust::device_vector<bool> d_stencil = h_stencil;
-    
+
     thrust::host_vector<T>   h_result(n);
     thrust::device_vector<T> d_result(n);
 
     size_t h_size = thrust::remove_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), h_result.begin(), is_true<T>()) - h_result.begin();
     size_t d_size = thrust::remove_copy_if(d_data.begin(), d_data.end(), d_stencil.begin(), d_result.begin(), is_true<T>()) - d_result.begin();
-    
+
     ASSERT_EQUAL(h_size, d_size);
 
     h_result.resize(h_size);
@@ -741,7 +741,7 @@ void TestRemoveCopyIfStencilToDiscardIterator(const size_t n)
 {
     thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
     thrust::device_vector<T> d_data = h_data;
-    
+
     thrust::host_vector<bool>   h_stencil = unittest::random_integers<bool>(n);
     thrust::device_vector<bool> d_stencil = h_stencil;
 
@@ -759,4 +759,3 @@ void TestRemoveCopyIfStencilToDiscardIterator(const size_t n)
     ASSERT_EQUAL_QUIET(reference, d_result);
 }
 DECLARE_VARIABLE_UNITTEST(TestRemoveCopyIfStencilToDiscardIterator);
-
diff --git a/testing/replace.cu b/testing/replace.cu
index 31e9890bb..9ba33ddde 100644
--- a/testing/replace.cu
+++ b/testing/replace.cu
@@ -603,8 +603,8 @@ void TestReplaceCopyIf(const size_t n)
     thrust::host_vector<T>   h_dest(n);
     thrust::device_vector<T> d_dest(n);
 
-    thrust::replace_copy_if(h_data.begin(), h_data.end(), h_dest.begin(), less_than_five<T>(), 0);
-    thrust::replace_copy_if(d_data.begin(), d_data.end(), d_dest.begin(), less_than_five<T>(), 0);
+    thrust::replace_copy_if(h_data.begin(), h_data.end(), h_dest.begin(), less_than_five<T>(), T{0});
+    thrust::replace_copy_if(d_data.begin(), d_data.end(), d_dest.begin(), less_than_five<T>(), T{0});
 
     ASSERT_ALMOST_EQUAL(h_data, d_data);
     ASSERT_ALMOST_EQUAL(h_dest, d_dest);
@@ -619,10 +619,10 @@ void TestReplaceCopyIfToDiscardIterator(const size_t n)
     thrust::device_vector<T> d_data = h_data;
 
     thrust::discard_iterator<> h_result =
-      thrust::replace_copy_if(h_data.begin(), h_data.end(), thrust::make_discard_iterator(), less_than_five<T>(), 0);
+      thrust::replace_copy_if(h_data.begin(), h_data.end(), thrust::make_discard_iterator(), less_than_five<T>(), T{0});
 
     thrust::discard_iterator<> d_result =
-      thrust::replace_copy_if(d_data.begin(), d_data.end(), thrust::make_discard_iterator(), less_than_five<T>(), 0);
+      thrust::replace_copy_if(d_data.begin(), d_data.end(), thrust::make_discard_iterator(), less_than_five<T>(), T{0});
 
     thrust::discard_iterator<> reference(n);
 
@@ -643,8 +643,8 @@ void TestReplaceCopyIfStencil(const size_t n)
     thrust::host_vector<T>   h_dest(n);
     thrust::device_vector<T> d_dest(n);
 
-    thrust::replace_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), h_dest.begin(), less_than_five<T>(), 0);
-    thrust::replace_copy_if(d_data.begin(), d_data.end(), d_stencil.begin(), d_dest.begin(), less_than_five<T>(), 0);
+    thrust::replace_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), h_dest.begin(), less_than_five<T>(), T{0});
+    thrust::replace_copy_if(d_data.begin(), d_data.end(), d_stencil.begin(), d_dest.begin(), less_than_five<T>(), T{0});
 
     ASSERT_ALMOST_EQUAL(h_data, d_data);
     ASSERT_ALMOST_EQUAL(h_dest, d_dest);
@@ -661,10 +661,10 @@ void TestReplaceCopyIfStencilToDiscardIterator(const size_t n)
     thrust::device_vector<T> d_stencil = h_stencil;
 
     thrust::discard_iterator<> h_result =
-      thrust::replace_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), thrust::make_discard_iterator(), less_than_five<T>(), 0);
+      thrust::replace_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), thrust::make_discard_iterator(), less_than_five<T>(), T{0});
 
     thrust::discard_iterator<> d_result =
-      thrust::replace_copy_if(d_data.begin(), d_data.end(), d_stencil.begin(), thrust::make_discard_iterator(), less_than_five<T>(), 0);
+      thrust::replace_copy_if(d_data.begin(), d_data.end(), d_stencil.begin(), thrust::make_discard_iterator(), less_than_five<T>(), T{0});
 
     thrust::discard_iterator<> reference(n);
 
diff --git a/testing/reverse.cu b/testing/reverse.cu
index b04e446dc..1ea4b9b38 100644
--- a/testing/reverse.cu
+++ b/testing/reverse.cu
@@ -73,6 +73,16 @@ DECLARE_UNITTEST(TestReverseDispatchImplicit);
 template<typename Vector>
 void TestReverseCopySimple(void)
 {
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC && \
+    THRUST_GCC_VERSION >= 80000 && THRUST_GCC_VERSION < 100000
+
+  if (typeid(Vector) == typeid(thrust::host_vector<custom_numeric>))
+  {
+    KNOWN_FAILURE // WAR NVBug 2481122
+  }
+
+#endif
+
   typedef typename Vector::iterator   Iterator;
 
   Vector input(5);
diff --git a/testing/scan.cu b/testing/scan.cu
index 875ed46a9..bceac4038 100644
--- a/testing/scan.cu
+++ b/testing/scan.cu
@@ -1,8 +1,14 @@
 #include <unittest/unittest.h>
+
+#include <thrust/detail/config.h>
+
 #include <thrust/scan.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/retag.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
 
 
 template<typename T>
@@ -20,6 +26,17 @@ template <class Vector>
 void TestScanSimple(void)
 {
     typedef typename Vector::value_type T;
+
+    // icc miscompiles the intermediate sum updates for custom_numeric.
+    // The issue doesn't happen with opts disabled, or on other compilers.
+    // Printing the intermediate sum each iteration "fixes" the issue,
+    // so likely a bad optimization.
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_INTEL
+    if (std::is_same<T, custom_numeric>::value)
+    {
+      return;
+    }
+#endif
     
     typename Vector::iterator iter;
 
@@ -247,48 +264,49 @@ void TestScanMixedTypes(void)
 
     IntVector   int_output(4);
     FloatVector float_output(4);
-     
-    // float -> int should use using plus<int> operator by default
+
+    // float -> int should use plus<void> operator and float accumulator by default
     thrust::inclusive_scan(float_input.begin(), float_input.end(), int_output.begin());
-    ASSERT_EQUAL(int_output[0],  1);
-    ASSERT_EQUAL(int_output[1],  3);
-    ASSERT_EQUAL(int_output[2],  6);
-    ASSERT_EQUAL(int_output[3], 10);
-    
-    // float -> float with plus<int> operator (int accumulator)
+    ASSERT_EQUAL(int_output[0],  1); // in: 1.5 accum: 1.5f out: 1
+    ASSERT_EQUAL(int_output[1],  4); // in: 2.5 accum: 4.0f out: 4
+    ASSERT_EQUAL(int_output[2],  7); // in: 3.5 accum: 7.5f out: 7
+    ASSERT_EQUAL(int_output[3], 12); // in: 4.5 accum: 12.f out: 12
+
+    // float -> float with plus<int> operator (float accumulator)
     thrust::inclusive_scan(float_input.begin(), float_input.end(), float_output.begin(), thrust::plus<int>());
-    ASSERT_EQUAL(float_output[0],  1.5);
-    ASSERT_EQUAL(float_output[1],  3.0);
-    ASSERT_EQUAL(float_output[2],  6.0);
-    ASSERT_EQUAL(float_output[3], 10.0);
-    
-    // float -> int should use using plus<int> operator by default
+    ASSERT_EQUAL(float_output[0],  1.5f); // in: 1.5 accum: 1.5f out: 1.5f
+    ASSERT_EQUAL(float_output[1],  3.0f); // in: 2.5 accum: 3.0f out: 3.0f
+    ASSERT_EQUAL(float_output[2],  6.0f); // in: 3.5 accum: 6.0f out: 6.0f
+    ASSERT_EQUAL(float_output[3], 10.0f); // in: 4.5 accum: 10.f out: 10.f
+
+    // float -> int should use plus<void> operator and float accumulator by default
     thrust::exclusive_scan(float_input.begin(), float_input.end(), int_output.begin());
-    ASSERT_EQUAL(int_output[0], 0);
-    ASSERT_EQUAL(int_output[1], 1);
-    ASSERT_EQUAL(int_output[2], 3);
-    ASSERT_EQUAL(int_output[3], 6);
-    
-    // float -> int should use using plus<int> operator by default
+    ASSERT_EQUAL(int_output[0], 0); // out: 0.0f  in: 1.5 accum: 1.5f
+    ASSERT_EQUAL(int_output[1], 1); // out: 1.5f  in: 2.5 accum: 4.0f
+    ASSERT_EQUAL(int_output[2], 4); // out: 4.0f  in: 3.5 accum: 7.5f
+    ASSERT_EQUAL(int_output[3], 7); // out: 7.5f  in: 4.5 accum: 12.f
+
+    // float -> int should use plus<> operator and float accumulator by default
     thrust::exclusive_scan(float_input.begin(), float_input.end(), int_output.begin(), (float) 5.5);
-    ASSERT_EQUAL(int_output[0],  5);
-    ASSERT_EQUAL(int_output[1],  7);
-    ASSERT_EQUAL(int_output[2],  9);
-    ASSERT_EQUAL(int_output[3], 13);
-    
-    // int -> float should use using plus<float> operator by default
+    ASSERT_EQUAL(int_output[0],  5); // out: 5.5f  in: 1.5 accum: 7.0f
+    ASSERT_EQUAL(int_output[1],  7); // out: 7.0f  in: 2.5 accum: 9.5f
+    ASSERT_EQUAL(int_output[2],  9); // out: 9.5f  in: 3.5 accum: 13.0f
+    ASSERT_EQUAL(int_output[3], 13); // out: 13.f  in: 4.5 accum: 17.4f
+
+    // int -> float should use using plus<> operator and int accumulator by default
     thrust::inclusive_scan(int_input.begin(), int_input.end(), float_output.begin());
-    ASSERT_EQUAL(float_output[0],  1.0);
-    ASSERT_EQUAL(float_output[1],  3.0);
-    ASSERT_EQUAL(float_output[2],  6.0);
-    ASSERT_EQUAL(float_output[3], 10.0);
-    
-    // int -> float should use using plus<float> operator by default
+    ASSERT_EQUAL(float_output[0],  1.f); // in: 1 accum: 1  out: 1
+    ASSERT_EQUAL(float_output[1],  3.f); // in: 2 accum: 3  out: 3
+    ASSERT_EQUAL(float_output[2],  6.f); // in: 3 accum: 6  out: 6
+    ASSERT_EQUAL(float_output[3], 10.f); // in: 4 accum: 10 out: 10
+
+    // int -> float + float init_value should use using plus<> operator and
+    // float accumulator by default
     thrust::exclusive_scan(int_input.begin(), int_input.end(), float_output.begin(), (float) 5.5);
-    ASSERT_EQUAL(float_output[0],  5.5);
-    ASSERT_EQUAL(float_output[1],  6.5);
-    ASSERT_EQUAL(float_output[2],  8.5);
-    ASSERT_EQUAL(float_output[3], 11.5);
+    ASSERT_EQUAL(float_output[0],  5.5f); // out: 5.5f  in: 1 accum: 6.5f
+    ASSERT_EQUAL(float_output[1],  6.5f); // out: 6.0f  in: 2 accum: 8.5f
+    ASSERT_EQUAL(float_output[2],  8.5f); // out: 8.0f  in: 3 accum: 11.5f
+    ASSERT_EQUAL(float_output[3], 11.5f); // out: 11.f  in: 4 accum: 15.5f
 }
 void TestScanMixedTypesHost(void)
 {
@@ -476,7 +494,9 @@ void _TestScanWithLargeTypes(void)
     thrust::host_vector< FixedVector<T,N> > h_output(n);
 
     for(size_t i = 0; i < h_input.size(); i++)
-        h_input[i] = FixedVector<T,N>(i);
+    {
+        h_input[i] = FixedVector<T, N>(static_cast<T>(i));
+    }
 
     thrust::device_vector< FixedVector<T,N> > d_input = h_input;
     thrust::device_vector< FixedVector<T,N> > d_output(n);
@@ -555,3 +575,165 @@ void TestInclusiveScanWithIndirection(void)
 }
 DECLARE_INTEGRAL_VECTOR_UNITTEST(TestInclusiveScanWithIndirection);
 
+template <typename T>
+struct const_ref_plus_mod3
+{
+    T * table;
+
+    const_ref_plus_mod3(T * table) : table(table) {}
+
+    __host__ __device__
+    const T& operator()(T a, T b)
+    {
+        return table[(int) (a + b)];
+    }
+};
+
+template <typename Vector>
+void TestInclusiveScanWithConstAccumulator(void)
+{
+    // add numbers modulo 3 with external lookup table
+    typedef typename Vector::value_type T;
+
+    Vector data(7);
+    data[0] = 0;
+    data[1] = 1;
+    data[2] = 2;
+    data[3] = 1;
+    data[4] = 2;
+    data[5] = 0;
+    data[6] = 1;
+
+    Vector table(6);
+    table[0] = 0;
+    table[1] = 1;
+    table[2] = 2;
+    table[3] = 0;
+    table[4] = 1;
+    table[5] = 2;
+
+    thrust::inclusive_scan(data.begin(), data.end(), data.begin(), const_ref_plus_mod3<T>(thrust::raw_pointer_cast(&table[0])));
+    
+    ASSERT_EQUAL(data[0], T(0));
+    ASSERT_EQUAL(data[1], T(1));
+    ASSERT_EQUAL(data[2], T(0));
+    ASSERT_EQUAL(data[3], T(1));
+    ASSERT_EQUAL(data[4], T(0));
+    ASSERT_EQUAL(data[5], T(0));
+    ASSERT_EQUAL(data[6], T(1));
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestInclusiveScanWithConstAccumulator);
+
+struct only_set_when_expected_it
+{
+    long long expected;
+    bool * flag;
+
+    __host__ __device__ only_set_when_expected_it operator++() const { return *this; }
+    __host__ __device__ only_set_when_expected_it operator*() const { return *this; }
+    template<typename Difference>
+    __host__ __device__ only_set_when_expected_it operator+(Difference) const { return *this; }
+    template<typename Index>
+    __host__ __device__ only_set_when_expected_it operator[](Index) const { return *this; }
+
+    __device__
+    void operator=(long long value) const
+    {
+        if (value == expected)
+        {
+            *flag = true;
+        }
+    }
+};
+
+THRUST_NAMESPACE_BEGIN
+template<>
+struct iterator_traits<only_set_when_expected_it>
+{
+    typedef long long value_type;
+    typedef only_set_when_expected_it reference;
+};
+THRUST_NAMESPACE_END
+
+void TestInclusiveScanWithBigIndexesHelper(int magnitude)
+{
+    thrust::constant_iterator<long long> begin(1);
+    thrust::constant_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> has_executed = thrust::device_malloc<bool>(1);
+    *has_executed = false;
+
+    only_set_when_expected_it out = { (1ll << magnitude), thrust::raw_pointer_cast(has_executed) };
+
+    thrust::inclusive_scan(thrust::device, begin, end, out);
+
+    bool has_executed_h = *has_executed;
+    thrust::device_free(has_executed);
+
+    ASSERT_EQUAL(has_executed_h, true);
+}
+
+void TestInclusiveScanWithBigIndexes()
+{
+  TestInclusiveScanWithBigIndexesHelper(30);
+  TestInclusiveScanWithBigIndexesHelper(31);
+  TestInclusiveScanWithBigIndexesHelper(32);
+  TestInclusiveScanWithBigIndexesHelper(33);
+}
+
+DECLARE_UNITTEST(TestInclusiveScanWithBigIndexes);
+
+void TestExclusiveScanWithBigIndexesHelper(int magnitude)
+{
+    thrust::constant_iterator<long long> begin(1);
+    thrust::constant_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> has_executed = thrust::device_malloc<bool>(1);
+    *has_executed = false;
+
+    only_set_when_expected_it out = { (1ll << magnitude) - 1, thrust::raw_pointer_cast(has_executed) };
+
+    thrust::exclusive_scan(thrust::device, begin, end, out,0ll);
+
+    bool has_executed_h = *has_executed;
+    thrust::device_free(has_executed);
+
+    ASSERT_EQUAL(has_executed_h, true);
+}
+
+void TestExclusiveScanWithBigIndexes()
+{
+  TestExclusiveScanWithBigIndexesHelper(30);
+  TestExclusiveScanWithBigIndexesHelper(31);
+  TestExclusiveScanWithBigIndexesHelper(32);
+  TestExclusiveScanWithBigIndexesHelper(33);
+}
+
+DECLARE_UNITTEST(TestExclusiveScanWithBigIndexes);
+
+#if THRUST_CPP_DIALECT >= 2011
+
+struct Int {
+    int i{};
+    __host__ __device__ explicit Int(int num) : i(num) {}
+    __host__ __device__ Int() : i{} {}
+    __host__ __device__ Int operator+(Int const& o) const { return Int{this->i + o.i}; }
+};
+
+void TestInclusiveScanWithUserDefinedType()
+{
+    thrust::device_vector<Int> vec(5, Int{1});
+
+    thrust::inclusive_scan(
+        thrust::device,
+        vec.cbegin(),
+        vec.cend(),
+        vec.begin());
+
+    ASSERT_EQUAL(static_cast<Int>(vec.back()).i, 5);
+}
+DECLARE_UNITTEST(TestInclusiveScanWithUserDefinedType);
+
+#endif // c++11
diff --git a/testing/scan_by_key.cu b/testing/scan_by_key.cu
deleted file mode 100644
index efc48bdb4..000000000
--- a/testing/scan_by_key.cu
+++ /dev/null
@@ -1,629 +0,0 @@
-#include <unittest/unittest.h>
-#include <thrust/scan.h>
-#include <thrust/functional.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/iterator/retag.h>
-#include <thrust/random.h>
-
-
-template <typename Vector>
-void TestInclusiveScanByKeySimple(void)
-{
-    typedef typename Vector::value_type T;
-    typedef typename Vector::iterator   Iterator;
-
-    Vector keys(7);
-    Vector vals(7);
-
-    Vector output(7, 0);
-
-    keys[0] = 0; vals[0] = 1;
-    keys[1] = 1; vals[1] = 2;
-    keys[2] = 1; vals[2] = 3;
-    keys[3] = 1; vals[3] = 4;
-    keys[4] = 2; vals[4] = 5;
-    keys[5] = 3; vals[5] = 6;
-    keys[6] = 3; vals[6] = 7;
-
-    Iterator iter = thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin());
-
-    ASSERT_EQUAL_QUIET(iter, output.end());
-
-    ASSERT_EQUAL(output[0],  1);
-    ASSERT_EQUAL(output[1],  2);
-    ASSERT_EQUAL(output[2],  5);
-    ASSERT_EQUAL(output[3],  9);
-    ASSERT_EQUAL(output[4],  5);
-    ASSERT_EQUAL(output[5],  6);
-    ASSERT_EQUAL(output[6], 13);
-    
-    thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), thrust::equal_to<T>(), thrust::multiplies<T>());
-
-    ASSERT_EQUAL(output[0],  1);
-    ASSERT_EQUAL(output[1],  2);
-    ASSERT_EQUAL(output[2],  6);
-    ASSERT_EQUAL(output[3], 24);
-    ASSERT_EQUAL(output[4],  5);
-    ASSERT_EQUAL(output[5],  6);
-    ASSERT_EQUAL(output[6], 42);
-    
-    thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), thrust::equal_to<T>());
-
-    ASSERT_EQUAL(output[0],  1);
-    ASSERT_EQUAL(output[1],  2);
-    ASSERT_EQUAL(output[2],  5);
-    ASSERT_EQUAL(output[3],  9);
-    ASSERT_EQUAL(output[4],  5);
-    ASSERT_EQUAL(output[5],  6);
-    ASSERT_EQUAL(output[6], 13);
-}
-DECLARE_VECTOR_UNITTEST(TestInclusiveScanByKeySimple);
-
-
-template<typename InputIterator1,
-         typename InputIterator2,
-         typename OutputIterator>
-OutputIterator inclusive_scan_by_key(my_system &system,
-                                     InputIterator1,
-                                     InputIterator1,
-                                     InputIterator2,
-                                     OutputIterator result)
-{
-    system.validate_dispatch();
-    return result;
-}
-
-void TestInclusiveScanByKeyDispatchExplicit()
-{
-    thrust::device_vector<int> vec(1);
-
-    my_system sys(0);
-    thrust::inclusive_scan_by_key(sys,
-                                  vec.begin(),
-                                  vec.begin(),
-                                  vec.begin(),
-                                  vec.begin());
-
-    ASSERT_EQUAL(true, sys.is_valid());
-}
-DECLARE_UNITTEST(TestInclusiveScanByKeyDispatchExplicit);
-
-
-template<typename InputIterator1,
-         typename InputIterator2,
-         typename OutputIterator>
-OutputIterator inclusive_scan_by_key(my_tag,
-                                     InputIterator1,
-                                     InputIterator1,
-                                     InputIterator2,
-                                     OutputIterator result)
-{
-    *result = 13;
-    return result;
-}
-
-void TestInclusiveScanByKeyDispatchImplicit()
-{
-    thrust::device_vector<int> vec(1);
-
-    thrust::inclusive_scan_by_key(thrust::retag<my_tag>(vec.begin()),
-                                  thrust::retag<my_tag>(vec.begin()),
-                                  thrust::retag<my_tag>(vec.begin()),
-                                  thrust::retag<my_tag>(vec.begin()));
-
-    ASSERT_EQUAL(13, vec.front());
-}
-DECLARE_UNITTEST(TestInclusiveScanByKeyDispatchImplicit);
-
-
-template <typename Vector>
-void TestExclusiveScanByKeySimple(void)
-{
-    typedef typename Vector::value_type T;
-    typedef typename Vector::iterator   Iterator;
-
-    Vector keys(7);
-    Vector vals(7);
-
-    Vector output(7, 0);
-
-    keys[0] = 0; vals[0] = 1;
-    keys[1] = 1; vals[1] = 2;
-    keys[2] = 1; vals[2] = 3;
-    keys[3] = 1; vals[3] = 4;
-    keys[4] = 2; vals[4] = 5;
-    keys[5] = 3; vals[5] = 6;
-    keys[6] = 3; vals[6] = 7;
-    
-    Iterator iter = thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin());
-
-    ASSERT_EQUAL_QUIET(iter, output.end());
-
-    ASSERT_EQUAL(output[0], 0);
-    ASSERT_EQUAL(output[1], 0);
-    ASSERT_EQUAL(output[2], 2);
-    ASSERT_EQUAL(output[3], 5);
-    ASSERT_EQUAL(output[4], 0);
-    ASSERT_EQUAL(output[5], 0);
-    ASSERT_EQUAL(output[6], 6);
-
-    thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), T(10));
-
-    ASSERT_EQUAL(output[0], 10);
-    ASSERT_EQUAL(output[1], 10);
-    ASSERT_EQUAL(output[2], 12);
-    ASSERT_EQUAL(output[3], 15);
-    ASSERT_EQUAL(output[4], 10);
-    ASSERT_EQUAL(output[5], 10);
-    ASSERT_EQUAL(output[6], 16);
-    
-    thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), T(10), thrust::equal_to<T>(), thrust::multiplies<T>());
-
-    ASSERT_EQUAL(output[0], 10);
-    ASSERT_EQUAL(output[1], 10);
-    ASSERT_EQUAL(output[2], 20);
-    ASSERT_EQUAL(output[3], 60);
-    ASSERT_EQUAL(output[4], 10);
-    ASSERT_EQUAL(output[5], 10);
-    ASSERT_EQUAL(output[6], 60);
-    
-    thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), T(10), thrust::equal_to<T>());
-
-    ASSERT_EQUAL(output[0], 10);
-    ASSERT_EQUAL(output[1], 10);
-    ASSERT_EQUAL(output[2], 12);
-    ASSERT_EQUAL(output[3], 15);
-    ASSERT_EQUAL(output[4], 10);
-    ASSERT_EQUAL(output[5], 10);
-    ASSERT_EQUAL(output[6], 16);
-}
-DECLARE_VECTOR_UNITTEST(TestExclusiveScanByKeySimple);
-
-
-template<typename InputIterator1,
-         typename InputIterator2,
-         typename OutputIterator>
-OutputIterator exclusive_scan_by_key(my_system &system,
-                                     InputIterator1,
-                                     InputIterator1,
-                                     InputIterator2,
-                                     OutputIterator result)
-{
-    system.validate_dispatch();
-    return result;
-}
-
-void TestExclusiveScanByKeyDispatchExplicit()
-{
-    thrust::device_vector<int> vec(1);
-
-    my_system sys(0);
-    thrust::exclusive_scan_by_key(sys,
-                                  vec.begin(),
-                                  vec.begin(),
-                                  vec.begin(),
-                                  vec.begin());
-
-    ASSERT_EQUAL(true, sys.is_valid());
-}
-DECLARE_UNITTEST(TestExclusiveScanByKeyDispatchExplicit);
-
-
-template<typename InputIterator1,
-         typename InputIterator2,
-         typename OutputIterator>
-OutputIterator exclusive_scan_by_key(my_tag,
-                                     InputIterator1,
-                                     InputIterator1,
-                                     InputIterator2,
-                                     OutputIterator result)
-{
-    *result = 13;
-    return result;
-}
-
-void TestExclusiveScanByKeyDispatchImplicit()
-{
-    thrust::device_vector<int> vec(1);
-
-    thrust::exclusive_scan_by_key(thrust::retag<my_tag>(vec.begin()),
-                                  thrust::retag<my_tag>(vec.begin()),
-                                  thrust::retag<my_tag>(vec.begin()),
-                                  thrust::retag<my_tag>(vec.begin()));
-
-    ASSERT_EQUAL(13, vec.front());
-}
-DECLARE_UNITTEST(TestExclusiveScanByKeyDispatchImplicit);
-
-
-struct head_flag_predicate
-{
-    template <typename T>
-    __host__ __device__
-    bool operator()(const T&, const T& b)
-    {
-        return b ? false : true;
-    }
-};
-
-template <typename Vector>
-void TestScanByKeyHeadFlags(void)
-{
-    typedef typename Vector::value_type T;
-
-    Vector keys(7);
-    Vector vals(7);
-
-    Vector output(7, 0);
-
-    keys[0] = 0; vals[0] = 1;
-    keys[1] = 1; vals[1] = 2;
-    keys[2] = 0; vals[2] = 3;
-    keys[3] = 0; vals[3] = 4;
-    keys[4] = 1; vals[4] = 5;
-    keys[5] = 1; vals[5] = 6;
-    keys[6] = 0; vals[6] = 7;
-    
-    thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), head_flag_predicate(), thrust::plus<T>());
-
-    ASSERT_EQUAL(output[0],  1);
-    ASSERT_EQUAL(output[1],  2);
-    ASSERT_EQUAL(output[2],  5);
-    ASSERT_EQUAL(output[3],  9);
-    ASSERT_EQUAL(output[4],  5);
-    ASSERT_EQUAL(output[5],  6);
-    ASSERT_EQUAL(output[6], 13);
-
-    thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), T(10), head_flag_predicate(), thrust::plus<T>());
-    
-    ASSERT_EQUAL(output[0], 10);
-    ASSERT_EQUAL(output[1], 10);
-    ASSERT_EQUAL(output[2], 12);
-    ASSERT_EQUAL(output[3], 15);
-    ASSERT_EQUAL(output[4], 10);
-    ASSERT_EQUAL(output[5], 10);
-    ASSERT_EQUAL(output[6], 16);
-}
-DECLARE_VECTOR_UNITTEST(TestScanByKeyHeadFlags);
-
-template <typename Vector>
-void TestInclusiveScanByKeyTransformIterator(void)
-{
-    typedef typename Vector::value_type T;
-
-    Vector keys(7);
-    Vector vals(7);
-
-    Vector output(7, 0);
-
-    keys[0] = 0; vals[0] = 1;
-    keys[1] = 1; vals[1] = 2;
-    keys[2] = 1; vals[2] = 3;
-    keys[3] = 1; vals[3] = 4;
-    keys[4] = 2; vals[4] = 5;
-    keys[5] = 3; vals[5] = 6;
-    keys[6] = 3; vals[6] = 7;
-
-    thrust::inclusive_scan_by_key
-        (keys.begin(), keys.end(),
-         thrust::make_transform_iterator(vals.begin(), thrust::negate<T>()), 
-         output.begin());
-    
-    ASSERT_EQUAL(output[0],  -1);
-    ASSERT_EQUAL(output[1],  -2);
-    ASSERT_EQUAL(output[2],  -5);
-    ASSERT_EQUAL(output[3],  -9);
-    ASSERT_EQUAL(output[4],  -5);
-    ASSERT_EQUAL(output[5],  -6);
-    ASSERT_EQUAL(output[6], -13);
-}
-DECLARE_VECTOR_UNITTEST(TestInclusiveScanByKeyTransformIterator);
-
-
-template <typename Vector>
-void TestScanByKeyReusedKeys(void)
-{
-    Vector keys(7);
-    Vector vals(7);
-
-    Vector output(7, 0);
-
-    keys[0] = 0; vals[0] = 1;
-    keys[1] = 1; vals[1] = 2;
-    keys[2] = 1; vals[2] = 3;
-    keys[3] = 1; vals[3] = 4;
-    keys[4] = 0; vals[4] = 5;
-    keys[5] = 1; vals[5] = 6;
-    keys[6] = 1; vals[6] = 7;
-    
-    thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin());
-
-    ASSERT_EQUAL(output[0],  1);
-    ASSERT_EQUAL(output[1],  2);
-    ASSERT_EQUAL(output[2],  5);
-    ASSERT_EQUAL(output[3],  9);
-    ASSERT_EQUAL(output[4],  5);
-    ASSERT_EQUAL(output[5],  6);
-    ASSERT_EQUAL(output[6], 13);
-
-    thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), typename Vector::value_type(10));
-    
-    ASSERT_EQUAL(output[0], 10);
-    ASSERT_EQUAL(output[1], 10);
-    ASSERT_EQUAL(output[2], 12);
-    ASSERT_EQUAL(output[3], 15);
-    ASSERT_EQUAL(output[4], 10);
-    ASSERT_EQUAL(output[5], 10);
-    ASSERT_EQUAL(output[6], 16);
-}
-DECLARE_VECTOR_UNITTEST(TestScanByKeyReusedKeys);
-
-
-template <typename T>
-void TestInclusiveScanByKey(const size_t n)
-{
-    // XXX WAR nvbug 1541533
-#if 0 //THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
-    if(typeid(T) == typeid(char) ||
-       typeid(T) == typeid(unsigned char))
-    {
-      KNOWN_FAILURE;
-    }
-#endif
-
-    thrust::host_vector<int> h_keys(n);
-    thrust::default_random_engine rng;
-    for(size_t i = 0, k = 0; i < n; i++){
-        h_keys[i] = k;
-        if (rng() % 10 == 0)
-            k++;
-    }
-    thrust::device_vector<int> d_keys = h_keys;
-
-    thrust::host_vector<T>   h_vals = unittest::random_integers<int>(n);
-    for(size_t i = 0; i < n; i++)
-        h_vals[i] = i % 10;
-    thrust::device_vector<T> d_vals = h_vals;
-
-    thrust::host_vector<T>   h_output(n);
-    thrust::device_vector<T> d_output(n);
-   
-    thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin());
-    thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin());
-    ASSERT_EQUAL(d_output, h_output);
-}
-DECLARE_VARIABLE_UNITTEST(TestInclusiveScanByKey);
-
-
-template <typename T>
-void TestExclusiveScanByKey(const size_t n)
-{
-    thrust::host_vector<int> h_keys(n);
-    thrust::default_random_engine rng;
-    for(size_t i = 0, k = 0; i < n; i++){
-        h_keys[i] = k;
-        if (rng() % 10 == 0)
-            k++;
-    }
-    thrust::device_vector<int> d_keys = h_keys;
-
-    thrust::host_vector<T>   h_vals = unittest::random_integers<int>(n);
-    for(size_t i = 0; i < n; i++)
-        h_vals[i] = i % 10;
-    thrust::device_vector<T> d_vals = h_vals;
-
-    thrust::host_vector<T>   h_output(n);
-    thrust::device_vector<T> d_output(n);
-   
-    // without init
-    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin());
-    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin());
-    ASSERT_EQUAL(d_output, h_output);
-    
-    // with init
-    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin(), (T) 11);
-    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin(), (T) 11);
-    ASSERT_EQUAL(d_output, h_output);
-}
-DECLARE_VARIABLE_UNITTEST(TestExclusiveScanByKey);
-
-
-template <typename T>
-void TestInclusiveScanByKeyInPlace(const size_t n)
-{
-    // XXX WAR nvbug 1541533
-#if 0 //THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
-    if(typeid(T) == typeid(char) ||
-       typeid(T) == typeid(unsigned char))
-    {
-      KNOWN_FAILURE;
-    }
-#endif
-
-    thrust::host_vector<int> h_keys(n);
-    thrust::default_random_engine rng;
-    for(size_t i = 0, k = 0; i < n; i++){
-        h_keys[i] = k;
-        if (rng() % 10 == 0)
-            k++;
-    }
-    thrust::device_vector<int> d_keys = h_keys;
-
-    thrust::host_vector<T>   h_vals = unittest::random_integers<int>(n);
-    for(size_t i = 0; i < n; i++)
-        h_vals[i] = i % 10;
-    thrust::device_vector<T> d_vals = h_vals;
-
-    thrust::host_vector<T>   h_output(n);
-    thrust::device_vector<T> d_output(n);
-   
-    // in-place scans
-    h_output = h_vals;
-    d_output = d_vals;
-    thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_output.begin(), h_output.begin());
-    thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_output.begin(), d_output.begin());
-    ASSERT_EQUAL(d_output, h_output);
-}
-DECLARE_VARIABLE_UNITTEST(TestInclusiveScanByKeyInPlace);
-
-
-template <typename T>
-void TestExclusiveScanByKeyInPlace(const size_t n)
-{
-    thrust::host_vector<int> h_keys(n);
-    thrust::default_random_engine rng;
-    for(size_t i = 0, k = 0; i < n; i++){
-        h_keys[i] = k;
-        if (rng() % 10 == 0)
-            k++;
-    }
-    thrust::device_vector<int> d_keys = h_keys;
-
-    thrust::host_vector<T>   h_vals = unittest::random_integers<int>(n);
-    for(size_t i = 0; i < n; i++)
-        h_vals[i] = i % 10;
-    thrust::device_vector<T> d_vals = h_vals;
-
-    thrust::host_vector<T>   h_output = h_vals;
-    thrust::device_vector<T> d_output = d_vals;
-    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_output.begin(), h_output.begin(), (T) 11);
-    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_output.begin(), d_output.begin(), (T) 11);
-    ASSERT_EQUAL(d_output, h_output);
-}
-DECLARE_VARIABLE_UNITTEST(TestExclusiveScanByKeyInPlace);
-
-
-void TestScanByKeyMixedTypes(void)
-{
-    const unsigned int n = 113;
-    
-    thrust::host_vector<int> h_keys(n);
-    thrust::default_random_engine rng;
-    for(size_t i = 0, k = 0; i < n; i++){
-        h_keys[i] = k;
-        if (rng() % 10 == 0)
-            k++;
-    }
-    thrust::device_vector<int> d_keys = h_keys;
-
-    thrust::host_vector<unsigned int> h_vals = unittest::random_integers<unsigned int>(n);
-    for(size_t i = 0; i < n; i++)
-        h_vals[i] %= 10;
-    thrust::device_vector<unsigned int> d_vals = h_vals;
-
-    thrust::host_vector<float>   h_float_output(n);
-    thrust::device_vector<float> d_float_output(n);
-    thrust::host_vector<int>   h_int_output(n);
-    thrust::device_vector<int> d_int_output(n);
-
-    //mixed vals/output types
-    thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_float_output.begin());
-    thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_float_output.begin());
-    ASSERT_EQUAL(d_float_output, h_float_output);
-    
-    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_float_output.begin(), (float) 3.5);
-    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_float_output.begin(), (float) 3.5);
-    ASSERT_EQUAL(d_float_output, h_float_output);
-    
-    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_float_output.begin(), (int) 3);
-    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_float_output.begin(), (int) 3);
-    ASSERT_EQUAL(d_float_output, h_float_output);
-    
-    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_int_output.begin(), (int) 3);
-    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_int_output.begin(), (int) 3);
-    ASSERT_EQUAL(d_int_output, h_int_output);
-    
-    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_int_output.begin(), (float) 3.5);
-    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_int_output.begin(), (float) 3.5);
-    ASSERT_EQUAL(d_int_output, h_int_output);
-}
-DECLARE_UNITTEST(TestScanByKeyMixedTypes);
-
-
-void TestScanByKeyLargeInput()
-{
-    const unsigned int N = 1 << 20;
-
-    thrust::host_vector<unsigned int> vals_sizes = unittest::random_integers<unsigned int>(10);
-        
-    thrust::host_vector<unsigned int>   h_vals = unittest::random_integers<unsigned int>(N);
-    thrust::device_vector<unsigned int> d_vals = h_vals;
-
-    thrust::host_vector<unsigned int>   h_output(N, 0);
-    thrust::device_vector<unsigned int> d_output(N, 0);
-
-    for (unsigned int i = 0; i < vals_sizes.size(); i++)
-    {
-        const unsigned int n = vals_sizes[i] % N;
-
-        // define segments
-        thrust::host_vector<unsigned int> h_keys(n);
-        thrust::default_random_engine rng;
-        for(size_t i = 0, k = 0; i < n; i++){
-            h_keys[i] = k;
-            if (rng() % 100 == 0)
-                k++;
-        }
-        thrust::device_vector<unsigned int> d_keys = h_keys;
-    
-        thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.begin() + n, h_vals.begin(), h_output.begin());
-        thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.begin() + n, d_vals.begin(), d_output.begin());
-        ASSERT_EQUAL(d_output, h_output);
-
-        thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.begin() + n, h_vals.begin(), h_output.begin());
-        thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.begin() + n, d_vals.begin(), d_output.begin());
-        ASSERT_EQUAL(d_output, h_output);
-   }
-}
-DECLARE_UNITTEST(TestScanByKeyLargeInput);
-
-
-template <typename T, unsigned int N>
-void _TestScanByKeyWithLargeTypes(void)
-{
-    size_t n = (64 * 1024) / sizeof(FixedVector<T,N>);
-
-    thrust::host_vector<   unsigned int   > h_keys(n);
-    thrust::host_vector< FixedVector<T,N> > h_vals(n);
-    thrust::host_vector< FixedVector<T,N> > h_output(n);
-
-    thrust::default_random_engine rng;
-    for(size_t i = 0, k = 0; i < h_vals.size(); i++)
-    {
-        h_vals[i] = FixedVector<T,N>(i);
-        h_keys[i]  = k;
-        if (rng() % 5 == 0)
-            k++;
-    }
-
-    thrust::device_vector<   unsigned int   > d_keys = h_keys;
-    thrust::device_vector< FixedVector<T,N> > d_vals = h_vals;
-    thrust::device_vector< FixedVector<T,N> > d_output(n);
-    
-    thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin());
-    thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin());
-
-    ASSERT_EQUAL_QUIET(h_output, d_output);
-    
-    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin(), FixedVector<T,N>(0));
-    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin(), FixedVector<T,N>(0));
-    
-    ASSERT_EQUAL_QUIET(h_output, d_output);
-}
-
-void TestScanByKeyWithLargeTypes(void)
-{
-    _TestScanByKeyWithLargeTypes<int,    1>();
-    _TestScanByKeyWithLargeTypes<int,    2>();
-    _TestScanByKeyWithLargeTypes<int,    4>();
-    _TestScanByKeyWithLargeTypes<int,    8>();
-    //_TestScanByKeyWithLargeTypes<int,   16>();  // too many resources requested for launch
-    //_TestScanByKeyWithLargeTypes<int,   32>();  
-    //_TestScanByKeyWithLargeTypes<int,   64>();  // too large to pass as argument
-    //_TestScanByKeyWithLargeTypes<int,  128>();
-    //_TestScanByKeyWithLargeTypes<int,  256>();
-    //_TestScanByKeyWithLargeTypes<int,  512>();
-    //_TestScanByKeyWithLargeTypes<int, 1024>();
-}
-DECLARE_UNITTEST(TestScanByKeyWithLargeTypes);
-
diff --git a/testing/scan_by_key.exclusive.cu b/testing/scan_by_key.exclusive.cu
new file mode 100644
index 000000000..58354d848
--- /dev/null
+++ b/testing/scan_by_key.exclusive.cu
@@ -0,0 +1,576 @@
+#include <thrust/scan.h>
+
+#include <thrust/functional.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/random.h>
+
+#include <unittest/unittest.h>
+
+
+template <typename Vector>
+void TestExclusiveScanByKeySimple()
+{
+  typedef typename Vector::value_type T;
+  typedef typename Vector::iterator Iterator;
+
+  Vector keys(7);
+  Vector vals(7);
+
+  Vector output(7, 0);
+
+  // clang-format off
+  keys[0] = 0; vals[0] = 1;
+  keys[1] = 1; vals[1] = 2;
+  keys[2] = 1; vals[2] = 3;
+  keys[3] = 1; vals[3] = 4;
+  keys[4] = 2; vals[4] = 5;
+  keys[5] = 3; vals[5] = 6;
+  keys[6] = 3; vals[6] = 7;
+  // clang-format on
+
+  Iterator iter = thrust::exclusive_scan_by_key(keys.begin(),
+                                                keys.end(),
+                                                vals.begin(),
+                                                output.begin());
+
+  ASSERT_EQUAL_QUIET(iter, output.end());
+
+  ASSERT_EQUAL(output[0], 0);
+  ASSERT_EQUAL(output[1], 0);
+  ASSERT_EQUAL(output[2], 2);
+  ASSERT_EQUAL(output[3], 5);
+  ASSERT_EQUAL(output[4], 0);
+  ASSERT_EQUAL(output[5], 0);
+  ASSERT_EQUAL(output[6], 6);
+
+  thrust::exclusive_scan_by_key(keys.begin(),
+                                keys.end(),
+                                vals.begin(),
+                                output.begin(),
+                                T(10));
+
+  ASSERT_EQUAL(output[0], 10);
+  ASSERT_EQUAL(output[1], 10);
+  ASSERT_EQUAL(output[2], 12);
+  ASSERT_EQUAL(output[3], 15);
+  ASSERT_EQUAL(output[4], 10);
+  ASSERT_EQUAL(output[5], 10);
+  ASSERT_EQUAL(output[6], 16);
+
+  thrust::exclusive_scan_by_key(keys.begin(),
+                                keys.end(),
+                                vals.begin(),
+                                output.begin(),
+                                T(10),
+                                thrust::equal_to<T>(),
+                                thrust::multiplies<T>());
+
+  ASSERT_EQUAL(output[0], 10);
+  ASSERT_EQUAL(output[1], 10);
+  ASSERT_EQUAL(output[2], 20);
+  ASSERT_EQUAL(output[3], 60);
+  ASSERT_EQUAL(output[4], 10);
+  ASSERT_EQUAL(output[5], 10);
+  ASSERT_EQUAL(output[6], 60);
+
+  thrust::exclusive_scan_by_key(keys.begin(),
+                                keys.end(),
+                                vals.begin(),
+                                output.begin(),
+                                T(10),
+                                thrust::equal_to<T>());
+
+  ASSERT_EQUAL(output[0], 10);
+  ASSERT_EQUAL(output[1], 10);
+  ASSERT_EQUAL(output[2], 12);
+  ASSERT_EQUAL(output[3], 15);
+  ASSERT_EQUAL(output[4], 10);
+  ASSERT_EQUAL(output[5], 10);
+  ASSERT_EQUAL(output[6], 16);
+}
+DECLARE_VECTOR_UNITTEST(TestExclusiveScanByKeySimple);
+
+
+template <typename InputIterator1,
+          typename InputIterator2,
+          typename OutputIterator>
+OutputIterator exclusive_scan_by_key(my_system& system,
+                                     InputIterator1,
+                                     InputIterator1,
+                                     InputIterator2,
+                                     OutputIterator result)
+{
+  system.validate_dispatch();
+  return result;
+}
+
+
+void TestExclusiveScanByKeyDispatchExplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  my_system sys(0);
+  thrust::exclusive_scan_by_key(sys,
+                                vec.begin(),
+                                vec.begin(),
+                                vec.begin(),
+                                vec.begin());
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestExclusiveScanByKeyDispatchExplicit);
+
+
+template <typename InputIterator1,
+          typename InputIterator2,
+          typename OutputIterator>
+OutputIterator exclusive_scan_by_key(my_tag,
+                                     InputIterator1,
+                                     InputIterator1,
+                                     InputIterator2,
+                                     OutputIterator result)
+{
+  *result = 13;
+  return result;
+}
+
+
+void TestExclusiveScanByKeyDispatchImplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  thrust::exclusive_scan_by_key(thrust::retag<my_tag>(vec.begin()),
+                                thrust::retag<my_tag>(vec.begin()),
+                                thrust::retag<my_tag>(vec.begin()),
+                                thrust::retag<my_tag>(vec.begin()));
+
+  ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestExclusiveScanByKeyDispatchImplicit);
+
+
+struct head_flag_predicate
+{
+  template <typename T>
+  __host__ __device__ bool operator()(const T&, const T& b)
+  {
+    return b ? false : true;
+  }
+};
+
+
+template <typename Vector>
+void TestScanByKeyHeadFlags()
+{
+  typedef typename Vector::value_type T;
+
+  Vector keys(7);
+  Vector vals(7);
+
+  Vector output(7, 0);
+
+  // clang-format off
+  keys[0] = 0; vals[0] = 1;
+  keys[1] = 1; vals[1] = 2;
+  keys[2] = 0; vals[2] = 3;
+  keys[3] = 0; vals[3] = 4;
+  keys[4] = 1; vals[4] = 5;
+  keys[5] = 1; vals[5] = 6;
+  keys[6] = 0; vals[6] = 7;
+  // clang-format on
+
+  thrust::exclusive_scan_by_key(keys.begin(),
+                                keys.end(),
+                                vals.begin(),
+                                output.begin(),
+                                T(10),
+                                head_flag_predicate(),
+                                thrust::plus<T>());
+
+  ASSERT_EQUAL(output[0], 10);
+  ASSERT_EQUAL(output[1], 10);
+  ASSERT_EQUAL(output[2], 12);
+  ASSERT_EQUAL(output[3], 15);
+  ASSERT_EQUAL(output[4], 10);
+  ASSERT_EQUAL(output[5], 10);
+  ASSERT_EQUAL(output[6], 16);
+}
+DECLARE_VECTOR_UNITTEST(TestScanByKeyHeadFlags);
+
+
+template <typename Vector>
+void TestScanByKeyReusedKeys()
+{
+  Vector keys(7);
+  Vector vals(7);
+
+  Vector output(7, 0);
+
+  // clang-format off
+  keys[0] = 0; vals[0] = 1;
+  keys[1] = 1; vals[1] = 2;
+  keys[2] = 1; vals[2] = 3;
+  keys[3] = 1; vals[3] = 4;
+  keys[4] = 0; vals[4] = 5;
+  keys[5] = 1; vals[5] = 6;
+  keys[6] = 1; vals[6] = 7;
+  // clang-format on
+
+  thrust::exclusive_scan_by_key(keys.begin(),
+                                keys.end(),
+                                vals.begin(),
+                                output.begin(),
+                                typename Vector::value_type(10));
+
+  ASSERT_EQUAL(output[0], 10);
+  ASSERT_EQUAL(output[1], 10);
+  ASSERT_EQUAL(output[2], 12);
+  ASSERT_EQUAL(output[3], 15);
+  ASSERT_EQUAL(output[4], 10);
+  ASSERT_EQUAL(output[5], 10);
+  ASSERT_EQUAL(output[6], 16);
+}
+DECLARE_VECTOR_UNITTEST(TestScanByKeyReusedKeys);
+
+
+template <typename T>
+void TestExclusiveScanByKey(const size_t n)
+{
+  thrust::host_vector<int> h_keys(n);
+  thrust::default_random_engine rng;
+  for (size_t i = 0, k = 0; i < n; i++)
+  {
+    h_keys[i] = static_cast<int>(k);
+    if (rng() % 10 == 0)
+    {
+      k++;
+    }
+  }
+  thrust::device_vector<int> d_keys = h_keys;
+
+  thrust::host_vector<T> h_vals = unittest::random_integers<int>(n);
+  for (size_t i = 0; i < n; i++)
+  {
+    h_vals[i] = static_cast<int>(i % 10);
+  }
+  thrust::device_vector<T> d_vals = h_vals;
+
+  thrust::host_vector<T> h_output(n);
+  thrust::device_vector<T> d_output(n);
+
+  // without init
+  thrust::exclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_output.begin());
+  thrust::exclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_output.begin());
+  ASSERT_EQUAL(d_output, h_output);
+
+  // with init
+  thrust::exclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_output.begin(),
+                                (T)11);
+  thrust::exclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_output.begin(),
+                                (T)11);
+  ASSERT_EQUAL(d_output, h_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestExclusiveScanByKey);
+
+
+template <typename T>
+void TestExclusiveScanByKeyInPlace(const size_t n)
+{
+  thrust::host_vector<int> h_keys(n);
+  thrust::default_random_engine rng;
+  for (size_t i = 0, k = 0; i < n; i++)
+  {
+    h_keys[i] = static_cast<int>(k);
+    if (rng() % 10 == 0)
+    {
+      k++;
+    }
+  }
+  thrust::device_vector<int> d_keys = h_keys;
+
+  thrust::host_vector<T> h_vals = unittest::random_integers<int>(n);
+  for (size_t i = 0; i < n; i++)
+  {
+    h_vals[i] = static_cast<int>(i % 10);
+  }
+  thrust::device_vector<T> d_vals = h_vals;
+
+  // in-place scans: in/out values aliasing
+  thrust::host_vector<T> h_output   = h_vals;
+  thrust::device_vector<T> d_output = d_vals;
+  thrust::exclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_output.begin(),
+                                h_output.begin(),
+                                (T)11);
+  thrust::exclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_output.begin(),
+                                d_output.begin(),
+                                (T)11);
+  ASSERT_EQUAL(d_output, h_output);
+
+  // in-place scans: in/out keys aliasing
+  thrust::exclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_keys.begin(),
+                                (T)11);
+  thrust::exclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_keys.begin(),
+                                (T)11);
+  ASSERT_EQUAL(d_keys, h_keys);
+}
+DECLARE_VARIABLE_UNITTEST(TestExclusiveScanByKeyInPlace);
+
+
+void TestScanByKeyMixedTypes()
+{
+  const unsigned int n = 113;
+
+  thrust::host_vector<int> h_keys(n);
+  thrust::default_random_engine rng;
+  for (size_t i = 0, k = 0; i < n; i++)
+  {
+    h_keys[i] = static_cast<int>(k);
+    if (rng() % 10 == 0)
+    {
+      k++;
+    }
+  }
+  thrust::device_vector<int> d_keys = h_keys;
+
+  thrust::host_vector<unsigned int> h_vals =
+    unittest::random_integers<unsigned int>(n);
+  for (size_t i = 0; i < n; i++)
+    h_vals[i] %= 10;
+  thrust::device_vector<unsigned int> d_vals = h_vals;
+
+  thrust::host_vector<float> h_float_output(n);
+  thrust::device_vector<float> d_float_output(n);
+  thrust::host_vector<int> h_int_output(n);
+  thrust::device_vector<int> d_int_output(n);
+
+  // mixed vals/output types
+  thrust::exclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_float_output.begin(),
+                                (float)3.5);
+  thrust::exclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_float_output.begin(),
+                                (float)3.5);
+  ASSERT_EQUAL(d_float_output, h_float_output);
+
+  thrust::exclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_float_output.begin(),
+                                (int)3);
+  thrust::exclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_float_output.begin(),
+                                (int)3);
+  ASSERT_EQUAL(d_float_output, h_float_output);
+
+  thrust::exclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_int_output.begin(),
+                                (int)3);
+  thrust::exclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_int_output.begin(),
+                                (int)3);
+  ASSERT_EQUAL(d_int_output, h_int_output);
+
+  thrust::exclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_int_output.begin(),
+                                (float)3.5);
+  thrust::exclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_int_output.begin(),
+                                (float)3.5);
+  ASSERT_EQUAL(d_int_output, h_int_output);
+}
+DECLARE_UNITTEST(TestScanByKeyMixedTypes);
+
+
+template <typename T>
+void TestScanByKeyDiscardOutput(std::size_t n)
+{
+  thrust::host_vector<T> h_keys(n);
+  thrust::default_random_engine rng;
+
+  for (size_t i = 0, k = 0; i < n; i++)
+  {
+    h_keys[i] = static_cast<T>(k);
+    if (rng() % 10 == 0)
+    {
+      k++;
+    }
+  }
+  thrust::device_vector<T> d_keys = h_keys;
+
+  thrust::host_vector<T> h_vals(n);
+  for (size_t i = 0; i < n; i++)
+  {
+    h_vals[i] = static_cast<T>(i % 10);
+  }
+  thrust::device_vector<T> d_vals = h_vals;
+
+  auto out = thrust::make_discard_iterator();
+
+  // These are no-ops, but they should compile.
+  thrust::exclusive_scan_by_key(d_keys.cbegin(),
+                                d_keys.cend(),
+                                d_vals.cbegin(),
+                                out);
+  thrust::exclusive_scan_by_key(d_keys.cbegin(),
+                                d_keys.cend(),
+                                d_vals.cbegin(),
+                                out,
+                                T{});
+  thrust::exclusive_scan_by_key(d_keys.cbegin(),
+                                d_keys.cend(),
+                                d_vals.cbegin(),
+                                out,
+                                T{},
+                                thrust::equal_to<T>{});
+  thrust::exclusive_scan_by_key(d_keys.cbegin(),
+                                d_keys.cend(),
+                                d_vals.cbegin(),
+                                out,
+                                T{},
+                                thrust::equal_to<T>{},
+                                thrust::multiplies<T>{});
+}
+DECLARE_VARIABLE_UNITTEST(TestScanByKeyDiscardOutput);
+
+
+void TestScanByKeyLargeInput()
+{
+  const unsigned int N = 1 << 20;
+
+  thrust::host_vector<unsigned int> vals_sizes =
+    unittest::random_integers<unsigned int>(10);
+
+  thrust::host_vector<unsigned int> h_vals =
+    unittest::random_integers<unsigned int>(N);
+  thrust::device_vector<unsigned int> d_vals = h_vals;
+
+  thrust::host_vector<unsigned int> h_output(N, 0);
+  thrust::device_vector<unsigned int> d_output(N, 0);
+
+  for (unsigned int i = 0; i < vals_sizes.size(); i++)
+  {
+    const unsigned int n = vals_sizes[i] % N;
+
+    // define segments
+    thrust::host_vector<unsigned int> h_keys(n);
+    thrust::default_random_engine rng;
+    for (size_t j = 0, k = 0; j < n; j++)
+    {
+      h_keys[j] = static_cast<unsigned int>(k);
+      if (rng() % 100 == 0)
+      {
+        k++;
+      }
+    }
+    thrust::device_vector<unsigned int> d_keys = h_keys;
+
+    thrust::exclusive_scan_by_key(h_keys.begin(),
+                                  h_keys.begin() + n,
+                                  h_vals.begin(),
+                                  h_output.begin());
+    thrust::exclusive_scan_by_key(d_keys.begin(),
+                                  d_keys.begin() + n,
+                                  d_vals.begin(),
+                                  d_output.begin());
+    ASSERT_EQUAL(d_output, h_output);
+  }
+}
+DECLARE_UNITTEST(TestScanByKeyLargeInput);
+
+
+template <typename T, unsigned int N>
+void _TestScanByKeyWithLargeTypes()
+{
+  size_t n = (64 * 1024) / sizeof(FixedVector<T, N>);
+
+  thrust::host_vector<unsigned int> h_keys(n);
+  thrust::host_vector<FixedVector<T, N>> h_vals(n);
+  thrust::host_vector<FixedVector<T, N>> h_output(n);
+
+  thrust::default_random_engine rng;
+  for (size_t i = 0, k = 0; i < h_vals.size(); i++)
+  {
+    h_keys[i] = static_cast<unsigned int>(k);
+    h_vals[i] = FixedVector<T, N>(static_cast<T>(i));
+    if (rng() % 5 == 0)
+    {
+      k++;
+    }
+  }
+
+  thrust::device_vector<unsigned int> d_keys      = h_keys;
+  thrust::device_vector<FixedVector<T, N>> d_vals = h_vals;
+  thrust::device_vector<FixedVector<T, N>> d_output(n);
+
+  thrust::exclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_output.begin(),
+                                FixedVector<T, N>(0));
+  thrust::exclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_output.begin(),
+                                FixedVector<T, N>(0));
+
+  ASSERT_EQUAL_QUIET(h_output, d_output);
+}
+
+
+void TestScanByKeyWithLargeTypes()
+{
+  _TestScanByKeyWithLargeTypes<int, 1>();
+  _TestScanByKeyWithLargeTypes<int, 2>();
+  _TestScanByKeyWithLargeTypes<int, 4>();
+  _TestScanByKeyWithLargeTypes<int, 8>();
+
+  // too many resources requested for launch:
+  //_TestScanByKeyWithLargeTypes<int,   16>();
+  //_TestScanByKeyWithLargeTypes<int,   32>();
+
+  // too large to pass as argument:
+  //_TestScanByKeyWithLargeTypes<int,   64>();
+  //_TestScanByKeyWithLargeTypes<int,  128>();
+  //_TestScanByKeyWithLargeTypes<int,  256>();
+  //_TestScanByKeyWithLargeTypes<int,  512>();
+  //_TestScanByKeyWithLargeTypes<int, 1024>();
+}
+DECLARE_UNITTEST(TestScanByKeyWithLargeTypes);
diff --git a/testing/scan_by_key.inclusive.cu b/testing/scan_by_key.inclusive.cu
new file mode 100644
index 000000000..b2d2337e2
--- /dev/null
+++ b/testing/scan_by_key.inclusive.cu
@@ -0,0 +1,524 @@
+#include <thrust/scan.h>
+
+#include <thrust/functional.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/random.h>
+
+#include <unittest/unittest.h>
+
+template <typename Vector>
+void TestInclusiveScanByKeySimple()
+{
+  typedef typename Vector::value_type T;
+  typedef typename Vector::iterator Iterator;
+
+  Vector keys(7);
+  Vector vals(7);
+
+  Vector output(7, 0);
+
+  // clang-format off
+  keys[0] = 0; vals[0] = 1;
+  keys[1] = 1; vals[1] = 2;
+  keys[2] = 1; vals[2] = 3;
+  keys[3] = 1; vals[3] = 4;
+  keys[4] = 2; vals[4] = 5;
+  keys[5] = 3; vals[5] = 6;
+  keys[6] = 3; vals[6] = 7;
+  // clang-format on
+
+  Iterator iter = thrust::inclusive_scan_by_key(keys.begin(),
+                                                keys.end(),
+                                                vals.begin(),
+                                                output.begin());
+
+  ASSERT_EQUAL_QUIET(iter, output.end());
+
+  ASSERT_EQUAL(output[0], 1);
+  ASSERT_EQUAL(output[1], 2);
+  ASSERT_EQUAL(output[2], 5);
+  ASSERT_EQUAL(output[3], 9);
+  ASSERT_EQUAL(output[4], 5);
+  ASSERT_EQUAL(output[5], 6);
+  ASSERT_EQUAL(output[6], 13);
+
+  thrust::inclusive_scan_by_key(keys.begin(),
+                                keys.end(),
+                                vals.begin(),
+                                output.begin(),
+                                thrust::equal_to<T>(),
+                                thrust::multiplies<T>());
+
+  ASSERT_EQUAL(output[0], 1);
+  ASSERT_EQUAL(output[1], 2);
+  ASSERT_EQUAL(output[2], 6);
+  ASSERT_EQUAL(output[3], 24);
+  ASSERT_EQUAL(output[4], 5);
+  ASSERT_EQUAL(output[5], 6);
+  ASSERT_EQUAL(output[6], 42);
+
+  thrust::inclusive_scan_by_key(keys.begin(),
+                                keys.end(),
+                                vals.begin(),
+                                output.begin(),
+                                thrust::equal_to<T>());
+
+  ASSERT_EQUAL(output[0], 1);
+  ASSERT_EQUAL(output[1], 2);
+  ASSERT_EQUAL(output[2], 5);
+  ASSERT_EQUAL(output[3], 9);
+  ASSERT_EQUAL(output[4], 5);
+  ASSERT_EQUAL(output[5], 6);
+  ASSERT_EQUAL(output[6], 13);
+}
+DECLARE_VECTOR_UNITTEST(TestInclusiveScanByKeySimple);
+
+
+template <typename InputIterator1,
+          typename InputIterator2,
+          typename OutputIterator>
+OutputIterator inclusive_scan_by_key(my_system& system,
+                                     InputIterator1,
+                                     InputIterator1,
+                                     InputIterator2,
+                                     OutputIterator result)
+{
+  system.validate_dispatch();
+  return result;
+}
+
+void TestInclusiveScanByKeyDispatchExplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  my_system sys(0);
+  thrust::inclusive_scan_by_key(sys,
+                                vec.begin(),
+                                vec.begin(),
+                                vec.begin(),
+                                vec.begin());
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestInclusiveScanByKeyDispatchExplicit);
+
+
+template <typename InputIterator1,
+          typename InputIterator2,
+          typename OutputIterator>
+OutputIterator inclusive_scan_by_key(my_tag,
+                                     InputIterator1,
+                                     InputIterator1,
+                                     InputIterator2,
+                                     OutputIterator result)
+{
+  *result = 13;
+  return result;
+}
+
+void TestInclusiveScanByKeyDispatchImplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  thrust::inclusive_scan_by_key(thrust::retag<my_tag>(vec.begin()),
+                                thrust::retag<my_tag>(vec.begin()),
+                                thrust::retag<my_tag>(vec.begin()),
+                                thrust::retag<my_tag>(vec.begin()));
+
+  ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestInclusiveScanByKeyDispatchImplicit);
+
+struct head_flag_predicate
+{
+  template <typename T>
+  __host__ __device__ bool operator()(const T&, const T& b)
+  {
+    return b ? false : true;
+  }
+};
+
+template <typename Vector>
+void TestScanByKeyHeadFlags()
+{
+  typedef typename Vector::value_type T;
+
+  Vector keys(7);
+  Vector vals(7);
+
+  Vector output(7, 0);
+
+  // clang-format off
+  keys[0] = 0; vals[0] = 1;
+  keys[1] = 1; vals[1] = 2;
+  keys[2] = 0; vals[2] = 3;
+  keys[3] = 0; vals[3] = 4;
+  keys[4] = 1; vals[4] = 5;
+  keys[5] = 1; vals[5] = 6;
+  keys[6] = 0; vals[6] = 7;
+  // clang-format on
+
+  thrust::inclusive_scan_by_key(keys.begin(),
+                                keys.end(),
+                                vals.begin(),
+                                output.begin(),
+                                head_flag_predicate(),
+                                thrust::plus<T>());
+
+  ASSERT_EQUAL(output[0], 1);
+  ASSERT_EQUAL(output[1], 2);
+  ASSERT_EQUAL(output[2], 5);
+  ASSERT_EQUAL(output[3], 9);
+  ASSERT_EQUAL(output[4], 5);
+  ASSERT_EQUAL(output[5], 6);
+  ASSERT_EQUAL(output[6], 13);
+}
+DECLARE_VECTOR_UNITTEST(TestScanByKeyHeadFlags);
+
+template <typename Vector>
+void TestInclusiveScanByKeyTransformIterator()
+{
+  typedef typename Vector::value_type T;
+
+  Vector keys(7);
+  Vector vals(7);
+
+  Vector output(7, 0);
+
+  // clang-format off
+  keys[0] = 0; vals[0] = 1;
+  keys[1] = 1; vals[1] = 2;
+  keys[2] = 1; vals[2] = 3;
+  keys[3] = 1; vals[3] = 4;
+  keys[4] = 2; vals[4] = 5;
+  keys[5] = 3; vals[5] = 6;
+  keys[6] = 3; vals[6] = 7;
+  // clang-format on
+
+  thrust::inclusive_scan_by_key(
+    keys.begin(),
+    keys.end(),
+    thrust::make_transform_iterator(vals.begin(), thrust::negate<T>()),
+    output.begin());
+
+  ASSERT_EQUAL(output[0], -1);
+  ASSERT_EQUAL(output[1], -2);
+  ASSERT_EQUAL(output[2], -5);
+  ASSERT_EQUAL(output[3], -9);
+  ASSERT_EQUAL(output[4], -5);
+  ASSERT_EQUAL(output[5], -6);
+  ASSERT_EQUAL(output[6], -13);
+}
+DECLARE_VECTOR_UNITTEST(TestInclusiveScanByKeyTransformIterator);
+
+
+template <typename Vector>
+void TestScanByKeyReusedKeys()
+{
+  Vector keys(7);
+  Vector vals(7);
+
+  Vector output(7, 0);
+
+  // clang-format off
+  keys[0] = 0; vals[0] = 1;
+  keys[1] = 1; vals[1] = 2;
+  keys[2] = 1; vals[2] = 3;
+  keys[3] = 1; vals[3] = 4;
+  keys[4] = 0; vals[4] = 5;
+  keys[5] = 1; vals[5] = 6;
+  keys[6] = 1; vals[6] = 7;
+  // clang-format on
+
+  thrust::inclusive_scan_by_key(keys.begin(),
+                                keys.end(),
+                                vals.begin(),
+                                output.begin());
+
+  ASSERT_EQUAL(output[0], 1);
+  ASSERT_EQUAL(output[1], 2);
+  ASSERT_EQUAL(output[2], 5);
+  ASSERT_EQUAL(output[3], 9);
+  ASSERT_EQUAL(output[4], 5);
+  ASSERT_EQUAL(output[5], 6);
+  ASSERT_EQUAL(output[6], 13);
+}
+DECLARE_VECTOR_UNITTEST(TestScanByKeyReusedKeys);
+
+
+template <typename T>
+void TestInclusiveScanByKey(const size_t n)
+{
+  thrust::host_vector<int> h_keys(n);
+  thrust::default_random_engine rng;
+  for (size_t i = 0, k = 0; i < n; i++)
+  {
+    h_keys[i] = static_cast<int>(k);
+    if (rng() % 10 == 0)
+    {
+      k++;
+    }
+  }
+  thrust::device_vector<int> d_keys = h_keys;
+
+  thrust::host_vector<T> h_vals = unittest::random_integers<int>(n);
+  for (size_t i = 0; i < n; i++)
+    h_vals[i] = static_cast<int>(i % 10);
+  thrust::device_vector<T> d_vals = h_vals;
+
+  thrust::host_vector<T> h_output(n);
+  thrust::device_vector<T> d_output(n);
+
+  thrust::inclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_output.begin());
+  thrust::inclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_output.begin());
+  ASSERT_EQUAL(d_output, h_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestInclusiveScanByKey);
+
+
+template <typename T>
+void TestInclusiveScanByKeyInPlace(const size_t n)
+{
+  thrust::host_vector<int> h_keys(n);
+  thrust::default_random_engine rng;
+  for (size_t i = 0, k = 0; i < n; i++)
+  {
+    h_keys[i] = static_cast<int>(k);
+    if (rng() % 10 == 0)
+    {
+      k++;
+    }
+  }
+  thrust::device_vector<int> d_keys = h_keys;
+
+  thrust::host_vector<T> h_vals = unittest::random_integers<int>(n);
+  for (size_t i = 0; i < n; i++)
+  {
+    h_vals[i] = static_cast<int>(i % 10);
+  }
+  thrust::device_vector<T> d_vals = h_vals;
+
+  thrust::host_vector<T> h_output(n);
+  thrust::device_vector<T> d_output(n);
+
+  // in-place scans: in/out values aliasing
+  h_output = h_vals;
+  d_output = d_vals;
+  thrust::inclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_output.begin(),
+                                h_output.begin());
+  thrust::inclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_output.begin(),
+                                d_output.begin());
+  ASSERT_EQUAL(d_output, h_output);
+
+  // in-place scans: in/out keys aliasing
+  thrust::inclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_keys.begin());
+  thrust::inclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_keys.begin());
+  ASSERT_EQUAL(d_keys, h_keys);
+}
+DECLARE_VARIABLE_UNITTEST(TestInclusiveScanByKeyInPlace);
+
+
+void TestScanByKeyMixedTypes()
+{
+  const unsigned int n = 113;
+
+  thrust::host_vector<int> h_keys(n);
+  thrust::default_random_engine rng;
+  for (size_t i = 0, k = 0; i < n; i++)
+  {
+    h_keys[i] = static_cast<int>(k);
+    if (rng() % 10 == 0)
+    {
+      k++;
+    }
+  }
+  thrust::device_vector<int> d_keys = h_keys;
+
+  thrust::host_vector<unsigned int> h_vals =
+    unittest::random_integers<unsigned int>(n);
+  for (size_t i = 0; i < n; i++)
+    h_vals[i] %= 10;
+  thrust::device_vector<unsigned int> d_vals = h_vals;
+
+  thrust::host_vector<float> h_float_output(n);
+  thrust::device_vector<float> d_float_output(n);
+  thrust::host_vector<int> h_int_output(n);
+  thrust::device_vector<int> d_int_output(n);
+
+  // mixed vals/output types
+  thrust::inclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_float_output.begin());
+  thrust::inclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_float_output.begin());
+  ASSERT_EQUAL(d_float_output, h_float_output);
+}
+DECLARE_UNITTEST(TestScanByKeyMixedTypes);
+
+
+template <typename T>
+void TestScanByKeyDiscardOutput(std::size_t n)
+{
+  thrust::host_vector<T> h_keys(n);
+  thrust::default_random_engine rng;
+
+  for (size_t i = 0, k = 0; i < n; i++)
+  {
+    h_keys[i] = static_cast<T>(k);
+    if (rng() % 10 == 0)
+    {
+      k++;
+    }
+  }
+  thrust::device_vector<T> d_keys = h_keys;
+
+  thrust::host_vector<T> h_vals(n);
+  for (size_t i = 0; i < n; i++)
+  {
+    h_vals[i] = static_cast<T>(i % 10);
+  }
+  thrust::device_vector<T> d_vals = h_vals;
+
+  auto out = thrust::make_discard_iterator();
+
+  // These are no-ops, but they should compile.
+  thrust::inclusive_scan_by_key(d_keys.cbegin(),
+                                d_keys.cend(),
+                                d_vals.cbegin(),
+                                out);
+  thrust::inclusive_scan_by_key(d_keys.cbegin(),
+                                d_keys.cend(),
+                                d_vals.cbegin(),
+                                out,
+                                thrust::equal_to<T>{});
+  thrust::inclusive_scan_by_key(d_keys.cbegin(),
+                                d_keys.cend(),
+                                d_vals.cbegin(),
+                                out,
+                                thrust::equal_to<T>{},
+                                thrust::multiplies<T>{});
+}
+DECLARE_VARIABLE_UNITTEST(TestScanByKeyDiscardOutput);
+
+
+void TestScanByKeyLargeInput()
+{
+  const unsigned int N = 1 << 20;
+
+  thrust::host_vector<unsigned int> vals_sizes =
+    unittest::random_integers<unsigned int>(10);
+
+  thrust::host_vector<unsigned int> h_vals =
+    unittest::random_integers<unsigned int>(N);
+  thrust::device_vector<unsigned int> d_vals = h_vals;
+
+  thrust::host_vector<unsigned int> h_output(N, 0);
+  thrust::device_vector<unsigned int> d_output(N, 0);
+
+  for (unsigned int i = 0; i < vals_sizes.size(); i++)
+  {
+    const unsigned int n = vals_sizes[i] % N;
+
+    // define segments
+    thrust::host_vector<unsigned int> h_keys(n);
+    thrust::default_random_engine rng;
+    for (size_t j = 0, k = 0; j < n; j++)
+    {
+      h_keys[j] = static_cast<unsigned int>(k);
+      if (rng() % 100 == 0)
+      {
+        k++;
+      }
+    }
+    thrust::device_vector<unsigned int> d_keys = h_keys;
+
+    thrust::inclusive_scan_by_key(h_keys.begin(),
+                                  h_keys.begin() + n,
+                                  h_vals.begin(),
+                                  h_output.begin());
+    thrust::inclusive_scan_by_key(d_keys.begin(),
+                                  d_keys.begin() + n,
+                                  d_vals.begin(),
+                                  d_output.begin());
+    ASSERT_EQUAL(d_output, h_output);
+  }
+}
+DECLARE_UNITTEST(TestScanByKeyLargeInput);
+
+
+template <typename T, unsigned int N>
+void _TestScanByKeyWithLargeTypes()
+{
+  size_t n = (64 * 1024) / sizeof(FixedVector<T, N>);
+
+  thrust::host_vector<unsigned int> h_keys(n);
+  thrust::host_vector<FixedVector<T, N>> h_vals(n);
+  thrust::host_vector<FixedVector<T, N>> h_output(n);
+
+  thrust::default_random_engine rng;
+  for (size_t i = 0, k = 0; i < h_vals.size(); i++)
+  {
+    h_keys[i] = static_cast<unsigned int>(k);
+    h_vals[i] = FixedVector<T, N>(static_cast<T>(i));
+    if (rng() % 5 == 0)
+    {
+      k++;
+    }
+  }
+
+  thrust::device_vector<unsigned int> d_keys      = h_keys;
+  thrust::device_vector<FixedVector<T, N>> d_vals = h_vals;
+  thrust::device_vector<FixedVector<T, N>> d_output(n);
+
+  thrust::inclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_output.begin());
+  thrust::inclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_output.begin());
+
+  ASSERT_EQUAL_QUIET(h_output, d_output);
+}
+
+void TestScanByKeyWithLargeTypes()
+{
+  _TestScanByKeyWithLargeTypes<int, 1>();
+  _TestScanByKeyWithLargeTypes<int, 2>();
+  _TestScanByKeyWithLargeTypes<int, 4>();
+  _TestScanByKeyWithLargeTypes<int, 8>();
+
+  // too many resources requested for launch:
+  //_TestScanByKeyWithLargeTypes<int,   16>();
+  //_TestScanByKeyWithLargeTypes<int,   32>();
+
+  // too large to pass as argument
+  //_TestScanByKeyWithLargeTypes<int,   64>();
+  //_TestScanByKeyWithLargeTypes<int,  128>();
+  //_TestScanByKeyWithLargeTypes<int,  256>();
+  //_TestScanByKeyWithLargeTypes<int,  512>();
+  //_TestScanByKeyWithLargeTypes<int, 1024>();
+}
+DECLARE_UNITTEST(TestScanByKeyWithLargeTypes);
diff --git a/testing/sequence.cu b/testing/sequence.cu
index cd3e17744..6d29db4c3 100644
--- a/testing/sequence.cu
+++ b/testing/sequence.cu
@@ -41,8 +41,9 @@ DECLARE_UNITTEST(TestSequenceDispatchImplicit);
 
 
 template <class Vector>
-void TestSequenceSimple(void)
+void TestSequenceSimple()
 {
+    using value_type = typename Vector::value_type;
     Vector v(5);
 
     thrust::sequence(v.begin(), v.end());
@@ -53,7 +54,7 @@ void TestSequenceSimple(void)
     ASSERT_EQUAL(v[3], 3);
     ASSERT_EQUAL(v[4], 4);
 
-    thrust::sequence(v.begin(), v.end(), 10);
+    thrust::sequence(v.begin(), v.end(), value_type{10});
 
     ASSERT_EQUAL(v[0], 10);
     ASSERT_EQUAL(v[1], 11);
@@ -61,7 +62,7 @@ void TestSequenceSimple(void)
     ASSERT_EQUAL(v[3], 13);
     ASSERT_EQUAL(v[4], 14);
     
-    thrust::sequence(v.begin(), v.end(), 10, 2);
+    thrust::sequence(v.begin(), v.end(), value_type{10}, value_type{2});
 
     ASSERT_EQUAL(v[0], 10);
     ASSERT_EQUAL(v[1], 12);
@@ -93,8 +94,8 @@ void TestSequence(size_t n)
 
     ASSERT_EQUAL(h_data, d_data);
     
-    thrust::sequence(h_data.begin(), h_data.end(), size_t(10), size_t(2));
-    thrust::sequence(d_data.begin(), d_data.end(), size_t(10), size_t(2));
+    thrust::sequence(h_data.begin(), h_data.end(), T(10), T(2));
+    thrust::sequence(d_data.begin(), d_data.end(), T(10), T(2));
 
     ASSERT_EQUAL(h_data, d_data);
 }
@@ -123,3 +124,47 @@ void TestSequenceComplex()
   thrust::sequence(m.begin(), m.end());
 }
 DECLARE_UNITTEST(TestSequenceComplex);
+
+// A class that doesnt accept conversion from size_t but can be multiplied by a scalar
+struct Vector
+{
+    Vector() = default;
+    // Explicitly disable construction from size_t
+    Vector(std::size_t) = delete;
+    __host__ __device__ Vector(int x_, int y_) : x{x_}, y{y_} {}
+    Vector(const Vector&) = default;
+    Vector &operator=(const Vector&) = default;
+
+    int x, y;
+};
+
+// Vector-Vector addition
+__host__ __device__ Vector operator+(const Vector a, const Vector b)
+{
+  return Vector{a.x + b.x, a.y + b.y};
+}
+
+// Vector-Scalar Multiplication
+// Multiplication by std::size_t is required by thrust::sequence.
+__host__ __device__ Vector operator*(const std::size_t a, const Vector b)
+{
+  return Vector{static_cast<int>(a) * b.x, static_cast<int>(a) * b.y};
+}
+__host__ __device__ Vector operator*(const Vector b, const std::size_t a)
+{
+  return Vector{static_cast<int>(a) * b.x, static_cast<int>(a) * b.y};
+}
+
+void TestSequenceNoSizeTConversion()
+{
+    thrust::device_vector<Vector> m(64);
+    thrust::sequence(m.begin(), m.end(), ::Vector{0, 0}, ::Vector{1, 2});
+
+    for (std::size_t i = 0; i < m.size(); ++i)
+    {
+        const ::Vector v = m[i];
+        ASSERT_EQUAL(static_cast<std::size_t>(v.x), i);
+        ASSERT_EQUAL(static_cast<std::size_t>(v.y), 2 * i);
+    }
+}
+DECLARE_UNITTEST(TestSequenceNoSizeTConversion);
diff --git a/testing/set_difference.cu b/testing/set_difference.cu
index b107bda36..5abc5f1fb 100644
--- a/testing/set_difference.cu
+++ b/testing/set_difference.cu
@@ -169,11 +169,11 @@ DECLARE_VARIABLE_UNITTEST(TestSetDifferenceEquivalentRanges);
 template<typename T>
 void TestSetDifferenceMultiset(const size_t n)
 {
-  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> vec = unittest::random_integers<T>(2 * n);
 
   // restrict elements to [min,13)
-  for(typename thrust::host_vector<T>::iterator i = temp.begin();
-      i != temp.end();
+  for(typename thrust::host_vector<T>::iterator i = vec.begin();
+      i != vec.end();
       ++i)
   {
     int temp = static_cast<int>(*i);
@@ -181,8 +181,8 @@ void TestSetDifferenceMultiset(const size_t n)
     *i = temp;
   }
 
-  thrust::host_vector<T> h_a(temp.begin(), temp.begin() + n);
-  thrust::host_vector<T> h_b(temp.begin() + n, temp.end());
+  thrust::host_vector<T> h_a(vec.begin(), vec.begin() + n);
+  thrust::host_vector<T> h_b(vec.begin() + n, vec.end());
 
   thrust::sort(h_a.begin(), h_a.end());
   thrust::sort(h_b.begin(), h_b.end());
@@ -211,3 +211,32 @@ void TestSetDifferenceMultiset(const size_t n)
 }
 DECLARE_VARIABLE_UNITTEST(TestSetDifferenceMultiset);
 
+// FIXME: disabled on Windows, because it causes a failure on the internal CI system in one specific configuration.
+// That failure will be tracked in a new NVBug, this is disabled to unblock submitting all the other changes.
+#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+void TestSetDifferenceWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(0);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    thrust::counting_iterator<long long> end_longer = end + 1;
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_vector<long long> result;
+    result.resize(1);
+    thrust::set_difference(thrust::device, begin, end_longer, begin, end, result.begin());
+
+    thrust::host_vector<long long> expected;
+    expected.push_back(*end);
+
+    ASSERT_EQUAL(result, expected);
+}
+
+void TestSetDifferenceWithBigIndexes()
+{
+    TestSetDifferenceWithBigIndexesHelper(30);
+    TestSetDifferenceWithBigIndexesHelper(31);
+    TestSetDifferenceWithBigIndexesHelper(32);
+    TestSetDifferenceWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestSetDifferenceWithBigIndexes);
+#endif
diff --git a/testing/set_difference_by_key.cu b/testing/set_difference_by_key.cu
index be68685fc..29dbb68fc 100644
--- a/testing/set_difference_by_key.cu
+++ b/testing/set_difference_by_key.cu
@@ -250,11 +250,11 @@ DECLARE_VARIABLE_UNITTEST(TestSetDifferenceByKeyEquivalentRanges);
 template<typename T>
 void TestSetDifferenceByKeyMultiset(const size_t n)
 {
-  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> vec = unittest::random_integers<T>(2 * n);
 
   // restrict elements to [min,13)
-  for(typename thrust::host_vector<T>::iterator i = temp.begin();
-      i != temp.end();
+  for(typename thrust::host_vector<T>::iterator i = vec.begin();
+      i != vec.end();
       ++i)
   {
     int temp = static_cast<int>(*i);
@@ -262,8 +262,8 @@ void TestSetDifferenceByKeyMultiset(const size_t n)
     *i = temp;
   }
 
-  thrust::host_vector<T> h_a_key(temp.begin(), temp.begin() + n);
-  thrust::host_vector<T> h_b_key(temp.begin() + n, temp.end());
+  thrust::host_vector<T> h_a_key(vec.begin(), vec.begin() + n);
+  thrust::host_vector<T> h_b_key(vec.begin() + n, vec.end());
 
   thrust::sort(h_a_key.begin(), h_a_key.end());
   thrust::sort(h_b_key.begin(), h_b_key.end());
diff --git a/testing/set_intersection.cu b/testing/set_intersection.cu
index 3cae00f30..93ef05d74 100644
--- a/testing/set_intersection.cu
+++ b/testing/set_intersection.cu
@@ -209,20 +209,20 @@ DECLARE_VARIABLE_UNITTEST(TestSetIntersectionEquivalentRanges);
 template<typename T>
 void TestSetIntersectionMultiset(const size_t n)
 {
-  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> vec = unittest::random_integers<T>(2 * n);
 
   // restrict elements to [min,13)
-  for(typename thrust::host_vector<T>::iterator i = temp.begin();
-      i != temp.end();
+  for(typename thrust::host_vector<T>::iterator i = vec.begin();
+      i != vec.end();
       ++i)
   {
-    int temp = static_cast<int>(*i);
-    temp %= 13;
-    *i = temp;
+    int tmp = static_cast<int>(*i);
+    tmp %= 13;
+    *i = static_cast<T>(tmp);
   }
 
-  thrust::host_vector<T> h_a(temp.begin(), temp.begin() + n);
-  thrust::host_vector<T> h_b(temp.begin() + n, temp.end());
+  thrust::host_vector<T> h_a(vec.begin(), vec.begin() + n);
+  thrust::host_vector<T> h_b(vec.begin() + n, vec.end());
 
   thrust::sort(h_a.begin(), h_a.end());
   thrust::sort(h_b.begin(), h_b.end());
@@ -251,3 +251,33 @@ void TestSetIntersectionMultiset(const size_t n)
 }
 DECLARE_VARIABLE_UNITTEST(TestSetIntersectionMultiset);
 
+// FIXME: disabled on Windows, because it causes a failure on the internal CI system in one specific configuration.
+// That failure will be tracked in a new NVBug, this is disabled to unblock submitting all the other changes.
+#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+void TestSetDifferenceWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin1(0);
+    thrust::counting_iterator<long long> begin2 = begin1 + (1ll << magnitude);
+    thrust::counting_iterator<long long> end1 = begin2 + 1;
+    thrust::counting_iterator<long long> end2 = begin2 + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin2, end1), 1);
+
+    thrust::device_vector<long long> result;
+    result.resize(1);
+    thrust::set_intersection(thrust::device, begin1, end1, begin2, end2, result.begin());
+
+    thrust::host_vector<long long> expected;
+    expected.push_back(*begin2);
+
+    ASSERT_EQUAL(result, expected);
+}
+
+void TestSetDifferenceWithBigIndexes()
+{
+    TestSetDifferenceWithBigIndexesHelper(30);
+    TestSetDifferenceWithBigIndexesHelper(31);
+    TestSetDifferenceWithBigIndexesHelper(32);
+    TestSetDifferenceWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestSetDifferenceWithBigIndexes);
+#endif
diff --git a/testing/set_intersection_by_key.cu b/testing/set_intersection_by_key.cu
index 6b7d51fc8..d82ee04ad 100644
--- a/testing/set_intersection_by_key.cu
+++ b/testing/set_intersection_by_key.cu
@@ -234,11 +234,11 @@ DECLARE_VARIABLE_UNITTEST(TestSetIntersectionByKeyEquivalentRanges);
 template<typename T>
 void TestSetIntersectionByKeyMultiset(const size_t n)
 {
-  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> vec = unittest::random_integers<T>(2 * n);
 
   // restrict elements to [min,13)
-  for(typename thrust::host_vector<T>::iterator i = temp.begin();
-      i != temp.end();
+  for(typename thrust::host_vector<T>::iterator i = vec.begin();
+      i != vec.end();
       ++i)
   {
     int temp = static_cast<int>(*i);
@@ -246,8 +246,8 @@ void TestSetIntersectionByKeyMultiset(const size_t n)
     *i = temp;
   }
 
-  thrust::host_vector<T> h_a_key(temp.begin(), temp.begin() + n);
-  thrust::host_vector<T> h_b_key(temp.begin() + n, temp.end());
+  thrust::host_vector<T> h_a_key(vec.begin(), vec.begin() + n);
+  thrust::host_vector<T> h_b_key(vec.begin() + n, vec.end());
 
   thrust::sort(h_a_key.begin(), h_a_key.end());
   thrust::sort(h_b_key.begin(), h_b_key.end());
diff --git a/testing/set_symmetric_difference.cu b/testing/set_symmetric_difference.cu
index b3e3c1493..dde145fec 100644
--- a/testing/set_symmetric_difference.cu
+++ b/testing/set_symmetric_difference.cu
@@ -168,11 +168,11 @@ DECLARE_VARIABLE_UNITTEST(TestSetSymmetricDifferenceEquivalentRanges);
 template<typename T>
 void TestSetSymmetricDifferenceMultiset(const size_t n)
 {
-  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> vec = unittest::random_integers<T>(2 * n);
 
   // restrict elements to [min,13)
-  for(typename thrust::host_vector<T>::iterator i = temp.begin();
-      i != temp.end();
+  for(typename thrust::host_vector<T>::iterator i = vec.begin();
+      i != vec.end();
       ++i)
   {
     int temp = static_cast<int>(*i);
@@ -180,8 +180,8 @@ void TestSetSymmetricDifferenceMultiset(const size_t n)
     *i = temp;
   }
 
-  thrust::host_vector<T> h_a(temp.begin(), temp.begin() + n);
-  thrust::host_vector<T> h_b(temp.begin() + n, temp.end());
+  thrust::host_vector<T> h_a(vec.begin(), vec.begin() + n);
+  thrust::host_vector<T> h_b(vec.begin() + n, vec.end());
 
   thrust::sort(h_a.begin(), h_a.end());
   thrust::sort(h_b.begin(), h_b.end());
diff --git a/testing/set_symmetric_difference_by_key.cu b/testing/set_symmetric_difference_by_key.cu
index c2688fdb8..98e416af8 100644
--- a/testing/set_symmetric_difference_by_key.cu
+++ b/testing/set_symmetric_difference_by_key.cu
@@ -254,11 +254,11 @@ DECLARE_VARIABLE_UNITTEST(TestSetSymmetricDifferenceByKeyEquivalentRanges);
 template<typename T>
 void TestSetSymmetricDifferenceByKeyMultiset(const size_t n)
 {
-  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> vec = unittest::random_integers<T>(2 * n);
 
   // restrict elements to [min,13)
-  for(typename thrust::host_vector<T>::iterator i = temp.begin();
-      i != temp.end();
+  for(typename thrust::host_vector<T>::iterator i = vec.begin();
+      i != vec.end();
       ++i)
   {
     int temp = static_cast<int>(*i);
@@ -266,8 +266,8 @@ void TestSetSymmetricDifferenceByKeyMultiset(const size_t n)
     *i = temp;
   }
 
-  thrust::host_vector<T> h_a_key(temp.begin(), temp.begin() + n);
-  thrust::host_vector<T> h_b_key(temp.begin() + n, temp.end());
+  thrust::host_vector<T> h_a_key(vec.begin(), vec.begin() + n);
+  thrust::host_vector<T> h_b_key(vec.begin() + n, vec.end());
 
   thrust::sort(h_a_key.begin(), h_a_key.end());
   thrust::sort(h_b_key.begin(), h_b_key.end());
diff --git a/testing/set_union_by_key.cu b/testing/set_union_by_key.cu
index ec8864941..7d58ebf4f 100644
--- a/testing/set_union_by_key.cu
+++ b/testing/set_union_by_key.cu
@@ -254,11 +254,11 @@ DECLARE_VARIABLE_UNITTEST(TestSetUnionByKeyEquivalentRanges);
 template<typename T>
 void TestSetUnionByKeyMultiset(const size_t n)
 {
-  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> vec = unittest::random_integers<T>(2 * n);
 
   // restrict elements to [min,13)
-  for(typename thrust::host_vector<T>::iterator i = temp.begin();
-      i != temp.end();
+  for(typename thrust::host_vector<T>::iterator i = vec.begin();
+      i != vec.end();
       ++i)
   {
     int temp = static_cast<int>(*i);
@@ -266,8 +266,8 @@ void TestSetUnionByKeyMultiset(const size_t n)
     *i = temp;
   }
 
-  thrust::host_vector<T> h_a_key(temp.begin(), temp.begin() + n);
-  thrust::host_vector<T> h_b_key(temp.begin() + n, temp.end());
+  thrust::host_vector<T> h_a_key(vec.begin(), vec.begin() + n);
+  thrust::host_vector<T> h_b_key(vec.begin() + n, vec.end());
 
   thrust::sort(h_a_key.begin(), h_a_key.end());
   thrust::sort(h_b_key.begin(), h_b_key.end());
diff --git a/testing/shuffle.cu b/testing/shuffle.cu
new file mode 100644
index 000000000..77e660c00
--- /dev/null
+++ b/testing/shuffle.cu
@@ -0,0 +1,602 @@
+#include <thrust/detail/config.h>
+
+#include <map>
+#include <limits>
+#include <thrust/random.h>
+#include <thrust/sequence.h>
+#include <thrust/shuffle.h>
+#include <thrust/sort.h>
+#include <unittest/unittest.h>
+
+// Functions for performing statistical tests of randomness
+// From NIST-Statistical-Test-Suite
+// Licence:
+//  "This software was developed at the National Institute of Standards and
+//  Technology by employees of the Federal Government in the course of their
+//  official duties. Pursuant to title 17 Section 105 of the United States Code
+//  this software is not subject to copyright protection and is in the public
+//  domain. The NIST Statistical Test Suite is an experimental system. NIST
+//  assumes no responsibility whatsoever for its use by other parties, and makes
+//  no guarantees, expressed or implied, about its quality, reliability, or any
+//  other characteristic. We would appreciate acknowledgment if the software is
+//  used."
+class CephesFunctions {
+public:
+  static double cephes_igamc(double a, double x) {
+    double ans, ax, c, yc, r, t, y, z;
+    double pk, pkm1, pkm2, qk, qkm1, qkm2;
+
+    if ((x <= 0) || (a <= 0))
+      return (1.0);
+
+    if ((x < 1.0) || (x < a))
+      return (1.e0 - cephes_igam(a, x));
+
+    ax = a * log(x) - x - cephes_lgam(a);
+
+    if (ax < -MAXLOG) {
+      printf("igamc: UNDERFLOW\n");
+      return 0.0;
+    }
+    ax = exp(ax);
+
+    /* continued fraction */
+    y = 1.0 - a;
+    z = x + y + 1.0;
+    c = 0.0;
+    pkm2 = 1.0;
+    qkm2 = x;
+    pkm1 = x + 1.0;
+    qkm1 = z * x;
+    ans = pkm1 / qkm1;
+
+    do {
+      c += 1.0;
+      y += 1.0;
+      z += 2.0;
+      yc = y * c;
+      pk = pkm1 * z - pkm2 * yc;
+      qk = qkm1 * z - qkm2 * yc;
+      if (qk != 0) {
+        r = pk / qk;
+        t = fabs((ans - r) / r);
+        ans = r;
+      } else
+        t = 1.0;
+      pkm2 = pkm1;
+      pkm1 = pk;
+      qkm2 = qkm1;
+      qkm1 = qk;
+      if (fabs(pk) > big) {
+        pkm2 *= biginv;
+        pkm1 *= biginv;
+        qkm2 *= biginv;
+        qkm1 *= biginv;
+      }
+    } while (t > MACHEP);
+
+    return ans * ax;
+  }
+
+private:
+  static constexpr double rel_error = 1E-12;
+
+  static constexpr double MACHEP = 1.11022302462515654042E-16;  // 2**-53
+  static constexpr double MAXLOG = 7.09782712893383996732224E2; // log(MAXNUM)
+  static constexpr double MAXNUM = 1.7976931348623158E308; // 2**1024*(1-MACHEP)
+  static constexpr double PI = 3.14159265358979323846;
+
+  static constexpr double big = 4.503599627370496e15;
+  static constexpr double biginv = 2.22044604925031308085e-16;
+
+  static int sgngam;
+
+  static double cephes_igam(double a, double x) {
+    double ans, ax, c, r;
+
+    if ((x <= 0) || (a <= 0))
+      return 0.0;
+
+    if ((x > 1.0) && (x > a))
+      return 1.e0 - cephes_igamc(a, x);
+
+    /* Compute  x**a * exp(-x) / gamma(a)  */
+    ax = a * log(x) - x - cephes_lgam(a);
+    if (ax < -MAXLOG) {
+      printf("igam: UNDERFLOW\n");
+      return 0.0;
+    }
+    ax = exp(ax);
+
+    /* power series */
+    r = a;
+    c = 1.0;
+    ans = 1.0;
+
+    do {
+      r += 1.0;
+      c *= x / r;
+      ans += c;
+    } while (c / ans > MACHEP);
+
+    return ans * ax / a;
+  }
+
+  /* A[]: Stirling's formula expansion of log gamma
+   * B[], C[]: log gamma function between 2 and 3
+   */
+  static constexpr double A[] = {
+      0.000811614167470508488140545910738410384510643780,
+      -0.000595061904284301438315674115386855191900394857,
+      0.000793650340457716942620114419781884862459264696,
+      -0.002777777777300996942672073330982129846233874559,
+      0.083333333333333189929525985917280195280909538269};
+  static constexpr double B[] = {
+      -1378.251525691208598800585605204105377197265625,
+      -38801.631513463784358464181423187255859375,
+      -331612.9927388711948879063129425048828125,
+      -1162370.97492762305773794651031494140625,
+      -1721737.00820839661173522472381591796875,
+      -853555.66424576542340219020843505859375};
+  static constexpr double C[] = {
+      -351.8157014365234545039129443466663360595703125,
+      -17064.21066518811494461260735988616943359375,
+      -220528.59055385444662533700466156005859375,
+      -1139334.44367982516996562480926513671875,
+      -2532523.07177582941949367523193359375,
+      -2018891.4143353276886045932769775390625};
+
+  static constexpr double MAXLGM = 2.556348e305;
+
+  /* Logarithm of gamma function */
+  static double cephes_lgam(double x) {
+    double p, q, u, w, z;
+    int i;
+
+    sgngam = 1;
+
+    if (x < -34.0) {
+      q = -x;
+      w = cephes_lgam(q); /* note this modifies sgngam! */
+      p = floor(q);
+      if (p == q) {
+      lgsing:
+        goto loverf;
+      }
+      i = (int)p;
+      if ((i & 1) == 0)
+        sgngam = -1;
+      else
+        sgngam = 1;
+      z = q - p;
+      if (z > 0.5) {
+        p += 1.0;
+        z = p - q;
+      }
+      z = q * sin(PI * z);
+      if (z == 0.0)
+        goto lgsing;
+      /*      z = log(PI) - log( z ) - w;*/
+      z = log(PI) - log(z) - w;
+      return z;
+    }
+
+    if (x < 13.0) {
+      z = 1.0;
+      p = 0.0;
+      u = x;
+      while (u >= 3.0) {
+        p -= 1.0;
+        u = x + p;
+        z *= u;
+      }
+      while (u < 2.0) {
+        if (u == 0.0)
+          goto lgsing;
+        z /= u;
+        p += 1.0;
+        u = x + p;
+      }
+      if (z < 0.0) {
+        sgngam = -1;
+        z = -z;
+      } else
+        sgngam = 1;
+      if (u == 2.0)
+        return (log(z));
+      p -= 2.0;
+      x = x + p;
+      p = x * cephes_polevl(x, B, 5) /
+          cephes_p1evl(x, C, 6);
+
+      return log(z) + p;
+    }
+
+    if (x > MAXLGM) {
+    loverf:
+      printf("lgam: OVERFLOW\n");
+
+      return sgngam * MAXNUM;
+    }
+
+    q = (x - 0.5) * log(x) - x + log(sqrt(2 * PI));
+    if (x > 1.0e8)
+      return q;
+
+    p = 1.0 / (x * x);
+    if (x >= 1000.0)
+      q +=
+          ((7.9365079365079365079365e-4 * p - 2.7777777777777777777778e-3) * p +
+           0.0833333333333333333333) /
+          x;
+    else
+      q += cephes_polevl(p, A, 4) / x;
+
+    return q;
+  }
+
+  static double cephes_polevl(double x, const double *coef, int N) {
+    const double *p = coef;
+    double ans = *p++;
+    int i = N;
+    do
+      ans = ans * x + *p++;
+    while (--i);
+
+    return ans;
+  }
+
+  static double cephes_p1evl(double x, const double *coef, int N) {
+    const double *p = coef;
+    double ans = x + *p++;
+    int i = N - 1;
+
+    do
+      ans = ans * x + *p++;
+    while (--i);
+
+    return ans;
+  }
+
+  static double cephes_erf(double x) {
+    static const double two_sqrtpi = 1.128379167095512574;
+    double sum = x, term = x, xsqr = x * x;
+    int j = 1;
+
+    if (fabs(x) > 2.2)
+      return 1.0 - cephes_erfc(x);
+
+    do {
+      term *= xsqr / j;
+      sum -= term / (2 * j + 1);
+      j++;
+      term *= xsqr / j;
+      sum += term / (2 * j + 1);
+      j++;
+    } while (fabs(term) / sum > rel_error);
+
+    return two_sqrtpi * sum;
+  }
+
+  static double cephes_erfc(double x) {
+    static const double one_sqrtpi = 0.564189583547756287;
+    double a = 1, b = x, c = x, d = x * x + 0.5;
+    double q1, q2 = b / d, n = 1.0, t;
+
+    if (fabs(x) < 2.2)
+      return 1.0 - cephes_erf(x);
+    if (x < 0)
+      return 2.0 - cephes_erfc(-x);
+
+    do {
+      t = a * n + b * x;
+      a = b;
+      b = t;
+      t = c * n + d * x;
+      c = d;
+      d = t;
+      n += 0.5;
+      q1 = q2;
+      q2 = b / d;
+    } while (fabs(q1 - q2) / q2 > rel_error);
+
+    return one_sqrtpi * exp(-x * x) * q2;
+  }
+
+  static double cephes_normal(double x) {
+    double arg, result, sqrt2 = 1.414213562373095048801688724209698078569672;
+
+    if (x > 0) {
+      arg = x / sqrt2;
+      result = 0.5 * (1 + erf(arg));
+    } else {
+      arg = -x / sqrt2;
+      result = 0.5 * (1 - erf(arg));
+    }
+
+    return (result);
+  }
+};
+int CephesFunctions::sgngam = 0;
+constexpr double CephesFunctions::A[];
+constexpr double CephesFunctions::B[];
+constexpr double CephesFunctions::C[];
+
+template <typename Vector>
+void TestShuffleSimple() {
+  Vector data(5);
+  data[0] = 0;
+  data[1] = 1;
+  data[2] = 2;
+  data[3] = 3;
+  data[4] = 4;
+  Vector shuffled(data.begin(), data.end());
+  thrust::default_random_engine g(2);
+  thrust::shuffle(shuffled.begin(), shuffled.end(), g);
+  thrust::sort(shuffled.begin(), shuffled.end());
+  // Check all of our data is present
+  // This only tests for strange conditions like duplicated elements
+  ASSERT_EQUAL(shuffled, data);
+}
+DECLARE_VECTOR_UNITTEST(TestShuffleSimple);
+
+template <typename Vector>
+void TestShuffleCopySimple() {
+  Vector data(5);
+  data[0] = 0;
+  data[1] = 1;
+  data[2] = 2;
+  data[3] = 3;
+  data[4] = 4;
+  Vector shuffled(5);
+  thrust::default_random_engine g(2);
+  thrust::shuffle_copy(data.begin(), data.end(), shuffled.begin(), g);
+  g.seed(2);
+  thrust::shuffle(data.begin(), data.end(), g);
+  ASSERT_EQUAL(shuffled, data);
+}
+DECLARE_VECTOR_UNITTEST(TestShuffleCopySimple);
+
+template <typename T>
+void TestHostDeviceIdentical(size_t m) {
+  thrust::host_vector<T> host_result(m);
+  thrust::host_vector<T> device_result(m);
+  thrust::sequence(host_result.begin(), host_result.end(), T{});
+  thrust::sequence(device_result.begin(), device_result.end(), T{});
+
+  thrust::default_random_engine host_g(183);
+  thrust::default_random_engine device_g(183);
+
+  thrust::shuffle(host_result.begin(), host_result.end(), host_g);
+  thrust::shuffle(device_result.begin(), device_result.end(), device_g);
+
+  ASSERT_EQUAL(device_result, host_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestHostDeviceIdentical);
+
+template <typename T>
+void TestFunctionIsBijection(size_t m) {
+  thrust::default_random_engine host_g(0xD5);
+  thrust::default_random_engine device_g(0xD5);
+
+  thrust::system::detail::generic::feistel_bijection host_f(m, host_g);
+  thrust::system::detail::generic::feistel_bijection device_f(m, device_g);
+
+  if (static_cast<double>(host_f.nearest_power_of_two()) >= static_cast<double>(std::numeric_limits<T>::max()) || m == 0) {
+    return;
+  }
+
+  thrust::host_vector<T> host_result(host_f.nearest_power_of_two());
+  thrust::host_vector<T> device_result(device_f.nearest_power_of_two());
+  thrust::sequence(host_result.begin(), host_result.end(), T{});
+  thrust::sequence(device_result.begin(), device_result.end(), T{});
+
+  thrust::transform(host_result.begin(), host_result.end(), host_result.begin(),
+                    host_f);
+  thrust::transform(device_result.begin(), device_result.end(),
+                    device_result.begin(), device_f);
+
+  ASSERT_EQUAL(host_result, device_result);
+
+  thrust::sort(host_result.begin(), host_result.end());
+  // Assert all values were generated exactly once
+  for (uint64_t i = 0; i < m; i++) {
+    ASSERT_EQUAL((uint64_t)host_result[i], i);
+  }
+}
+DECLARE_VARIABLE_UNITTEST(TestFunctionIsBijection);
+
+void TestBijectionLength() {
+  thrust::default_random_engine g(0xD5);
+
+  uint64_t m = 31;
+  thrust::system::detail::generic::feistel_bijection f(m, g);
+  ASSERT_EQUAL(f.nearest_power_of_two(), uint64_t(32));
+
+  m = 32;
+  f = thrust::system::detail::generic::feistel_bijection(m, g);
+  ASSERT_EQUAL(f.nearest_power_of_two(), uint64_t(32));
+
+  m = 1;
+  f = thrust::system::detail::generic::feistel_bijection(m, g);
+  ASSERT_EQUAL(f.nearest_power_of_two(), uint64_t(16));
+}
+DECLARE_UNITTEST(TestBijectionLength);
+
+// Individual input keys should be permuted to output locations with uniform
+// probability. Perform chi-squared test with confidence 99.9%.
+template <typename Vector>
+void TestShuffleKeyPosition() {
+  typedef typename Vector::value_type T;
+  size_t m = 20;
+  size_t num_samples = 100;
+  thrust::host_vector<size_t> index_sum(m, 0);
+  thrust::host_vector<T> sequence(m);
+  thrust::sequence(sequence.begin(), sequence.end(), T(0));
+
+  thrust::default_random_engine g(0xD5);
+  for (size_t i = 0; i < num_samples; i++) {
+    Vector shuffled(sequence.begin(), sequence.end());
+    thrust::shuffle(shuffled.begin(), shuffled.end(), g);
+    thrust::host_vector<T> tmp(shuffled.begin(), shuffled.end());
+
+    for (auto j = 0ull; j < m; j++) {
+      index_sum[tmp[j]] += j;
+    }
+  }
+
+  double expected_average_position = static_cast<double>(m - 1) / 2;
+  double chi_squared = 0.0;
+  for (auto j = 0ull; j < m; j++) {
+    double average_position = static_cast<double>(index_sum[j]) / num_samples;
+    chi_squared += std::pow(expected_average_position - average_position, 2) /
+                   expected_average_position;
+  }
+  // Tabulated chi-squared critical value for m-1=19 degrees of freedom
+  // and 99.9% confidence
+  double confidence_threshold = 43.82;
+  ASSERT_LESS(chi_squared, confidence_threshold);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestShuffleKeyPosition);
+
+struct vector_compare {
+  template <typename VectorT>
+  bool operator()(const VectorT &a, const VectorT &b) const {
+    for (auto i = 0ull; i < a.size(); i++) {
+      if (a[i] < b[i])
+        return true;
+      if (a[i] > b[i])
+        return false;
+    }
+    return false;
+  }
+};
+
+// Brute force check permutations are uniformly distributed on small input
+// Uses a chi-squared test indicating 99% confidence the output is uniformly
+// random
+template <typename Vector>
+void TestShuffleUniformPermutation() {
+  typedef typename Vector::value_type T;
+
+  size_t m = 5;
+  size_t num_samples = 1000;
+  size_t total_permutations = 1 * 2 * 3 * 4 * 5;
+  std::map<thrust::host_vector<T>, size_t, vector_compare> permutation_counts;
+  Vector sequence(m);
+  thrust::sequence(sequence.begin(), sequence.end(), T(0));
+  thrust::default_random_engine g(0xD5);
+  for (auto i = 0ull; i < num_samples; i++) {
+    thrust::shuffle(sequence.begin(), sequence.end(), g);
+    thrust::host_vector<T> tmp(sequence.begin(), sequence.end());
+    permutation_counts[tmp]++;
+  }
+
+  ASSERT_EQUAL(permutation_counts.size(), total_permutations);
+
+  double chi_squared = 0.0;
+  double expected_count = static_cast<double>(num_samples) / total_permutations;
+  for (auto kv : permutation_counts) {
+    chi_squared += std::pow(expected_count - kv.second, 2) / expected_count;
+  }
+  double p_score = CephesFunctions::cephes_igamc(
+      (double)(total_permutations - 1) / 2.0, chi_squared / 2.0);
+  ASSERT_GREATER(p_score, 0.01);
+}
+DECLARE_VECTOR_UNITTEST(TestShuffleUniformPermutation);
+
+template <typename Vector>
+void TestShuffleEvenSpacingBetweenOccurances() {
+  typedef typename Vector::value_type T;
+  const uint64_t shuffle_size = 10;
+  const uint64_t num_samples = 1000;
+
+  thrust::host_vector<T> h_results;
+  Vector sequence(shuffle_size);
+  thrust::sequence(sequence.begin(), sequence.end(), 0);
+  thrust::default_random_engine g(0xD6);
+  for (auto i = 0ull; i < num_samples; i++) {
+    thrust::shuffle(sequence.begin(), sequence.end(), g);
+    thrust::host_vector<T> tmp(sequence.begin(), sequence.end());
+    h_results.insert(h_results.end(), sequence.begin(), sequence.end());
+  }
+
+  std::vector<std::vector<std::vector<uint64_t>>> distance_between(
+      num_samples, std::vector<std::vector<uint64_t>>(
+                       num_samples, std::vector<uint64_t>(shuffle_size, 0)));
+
+  for (uint64_t sample = 0; sample < num_samples; sample++) {
+    for (uint64_t i = 0; i < shuffle_size - 1; i++) {
+      for (uint64_t j = 1; j < shuffle_size - i; j++) {
+        T val_1 = h_results[sample * shuffle_size + i];
+        T val_2 = h_results[sample * shuffle_size + i + j];
+        distance_between[val_1][val_2][j]++;
+        distance_between[val_2][val_1][shuffle_size - j]++;
+      }
+    }
+  }
+
+  const double expected_occurances = (double)num_samples / (shuffle_size - 1);
+  for (uint64_t val_1 = 0; val_1 < shuffle_size; val_1++) {
+    for (uint64_t val_2 = val_1 + 1; val_2 < shuffle_size; val_2++) {
+      double chi_squared = 0.0;
+      auto &distances = distance_between[val_1][val_2];
+      for (uint64_t i = 1; i < shuffle_size; i++) {
+        chi_squared += std::pow((double)distances[i] - expected_occurances, 2) /
+                       expected_occurances;
+      }
+
+      double p_score = CephesFunctions::cephes_igamc(
+          (double)(shuffle_size - 2) / 2.0, chi_squared / 2.0);
+      ASSERT_GREATER(p_score, 0.01);
+    }
+  }
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestShuffleEvenSpacingBetweenOccurances);
+
+template <typename Vector>
+void TestShuffleEvenDistribution() {
+  typedef typename Vector::value_type T;
+  const uint64_t shuffle_sizes[] = {10, 100, 500};
+  thrust::default_random_engine g(0xD5);
+  for (auto shuffle_size : shuffle_sizes) {
+    if(shuffle_size > (uint64_t)std::numeric_limits<T>::max())
+      continue;
+    const uint64_t num_samples = shuffle_size == 500 ? 1000 : 200;
+
+    std::vector<uint64_t> counts(shuffle_size * shuffle_size, 0);
+    Vector sequence(shuffle_size);
+    for (auto i = 0ull; i < num_samples; i++) {
+      thrust::sequence(sequence.begin(), sequence.end(), 0);
+      thrust::shuffle(sequence.begin(), sequence.end(), g);
+      thrust::host_vector<T> tmp(sequence.begin(), sequence.end());
+      for (uint64_t j = 0; j < shuffle_size; j++) {
+        assert(j < tmp.size());
+        counts.at(j * shuffle_size + tmp[j])++;
+      }
+    }
+
+    const double expected_occurances = (double)num_samples / shuffle_size;
+    for (uint64_t i = 0; i < shuffle_size; i++) {
+      double chi_squared_pos = 0.0;
+      double chi_squared_num = 0.0;
+      for (uint64_t j = 0; j < shuffle_size; j++) {
+        auto count_pos = counts.at(i * shuffle_size + j);
+        auto count_num = counts.at(j * shuffle_size + i);
+        chi_squared_pos +=
+            pow((double)count_pos - expected_occurances, 2) / expected_occurances;
+        chi_squared_num +=
+            pow((double)count_num - expected_occurances, 2) / expected_occurances;
+      }
+
+      double p_score_pos = CephesFunctions::cephes_igamc(
+          (double)(shuffle_size - 1) / 2.0, chi_squared_pos / 2.0);
+      ASSERT_GREATER(p_score_pos, 0.001 / (double)shuffle_size);
+
+      double p_score_num = CephesFunctions::cephes_igamc(
+          (double)(shuffle_size - 1) / 2.0, chi_squared_num / 2.0);
+      ASSERT_GREATER(p_score_num, 0.001 / (double)shuffle_size);
+    }
+  }
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestShuffleEvenDistribution);
diff --git a/testing/stable_sort_by_key_large.cu b/testing/stable_sort_by_key_large.cu
deleted file mode 100644
index fc69de64c..000000000
--- a/testing/stable_sort_by_key_large.cu
+++ /dev/null
@@ -1,155 +0,0 @@
-#include <unittest/unittest.h>
-#include <thrust/sort.h>
-#include <thrust/functional.h>
-
-template <typename T>
-struct less_div_10
-{
-  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return ((int) lhs) / 10 < ((int) rhs) / 10;}
-};
-
-template <typename T>
-struct greater_div_10
-{
-  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return ((int) lhs) / 10 > ((int) rhs) / 10;}
-};
-
-
-template <typename T, unsigned int N>
-void _TestStableSortByKeyWithLargeKeys(void)
-{
-    size_t n = (128 * 1024) / sizeof(FixedVector<T,N>);
-
-    thrust::host_vector< FixedVector<T,N> > h_keys(n);
-    thrust::host_vector<   unsigned int   > h_vals(n);
-
-    for(size_t i = 0; i < n; i++)
-    {
-        h_keys[i] = FixedVector<T,N>(rand());
-        h_vals[i] = i;
-    }
-
-    thrust::device_vector< FixedVector<T,N> > d_keys = h_keys;
-    thrust::device_vector<   unsigned int   > d_vals = h_vals;
-    
-    thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin());
-    thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin());
-
-    ASSERT_EQUAL_QUIET(h_keys, d_keys);
-    ASSERT_EQUAL_QUIET(h_vals, d_vals);
-}
-
-void TestStableSortByKeyWithLargeKeys(void)
-{
-    _TestStableSortByKeyWithLargeKeys<int,    4>();
-    _TestStableSortByKeyWithLargeKeys<int,    8>();
-    _TestStableSortByKeyWithLargeKeys<int,   16>();
-
-// XXX these take too long to compile
-//    _TestStableSortByKeyWithLargeKeys<int,   32>();
-//    _TestStableSortByKeyWithLargeKeys<int,   64>();
-//    _TestStableSortByKeyWithLargeKeys<int,  128>();
-//    _TestStableSortByKeyWithLargeKeys<int,  256>();
-//    _TestStableSortByKeyWithLargeKeys<int,  512>();
-//    _TestStableSortByKeyWithLargeKeys<int, 1024>();
-//    _TestStableSortByKeyWithLargeKeys<int, 2048>();
-//    _TestStableSortByKeyWithLargeKeys<int, 4096>();
-//    _TestStableSortByKeyWithLargeKeys<int, 8192>();
-}
-DECLARE_UNITTEST(TestStableSortByKeyWithLargeKeys);
-
-
-template <typename T, unsigned int N>
-void _TestStableSortByKeyWithLargeValues(void)
-{
-    size_t n = (128 * 1024) / sizeof(FixedVector<T,N>);
-
-    thrust::host_vector<   unsigned int   > h_keys(n);
-    thrust::host_vector< FixedVector<T,N> > h_vals(n);
-
-    for(size_t i = 0; i < n; i++)
-    {
-        h_keys[i] = rand();
-        h_vals[i] = FixedVector<T,N>(i);
-    }
-
-    thrust::device_vector<   unsigned int   > d_keys = h_keys;
-    thrust::device_vector< FixedVector<T,N> > d_vals = h_vals;
-    
-    thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin());
-    thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin());
-
-    ASSERT_EQUAL_QUIET(h_keys, d_keys);
-    ASSERT_EQUAL_QUIET(h_vals, d_vals);
-
-    // so cuda::stable_merge_sort_by_key() is called
-    thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), greater_div_10<unsigned int>());
-    thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), greater_div_10<unsigned int>());
-
-    ASSERT_EQUAL_QUIET(h_keys, d_keys);
-    ASSERT_EQUAL_QUIET(h_vals, d_vals);
-}
-
-void TestStableSortByKeyWithLargeValues(void)
-{
-    _TestStableSortByKeyWithLargeValues<int,    4>();
-    _TestStableSortByKeyWithLargeValues<int,    8>();
-    _TestStableSortByKeyWithLargeValues<int,   16>();
-    
-// XXX these take too long to compile
-//    _TestStableSortByKeyWithLargeValues<int,   32>();
-//    _TestStableSortByKeyWithLargeValues<int,   64>();
-//    _TestStableSortByKeyWithLargeValues<int,  128>();
-//    _TestStableSortByKeyWithLargeValues<int,  256>();
-//    _TestStableSortByKeyWithLargeValues<int,  512>();
-//    _TestStableSortByKeyWithLargeValues<int, 1024>();
-//    _TestStableSortByKeyWithLargeValues<int, 2048>();
-//    _TestStableSortByKeyWithLargeValues<int, 4096>();
-//    _TestStableSortByKeyWithLargeValues<int, 8192>();
-}
-DECLARE_UNITTEST(TestStableSortByKeyWithLargeValues);
-
-
-template <typename T, unsigned int N>
-void _TestStableSortByKeyWithLargeKeysAndValues(void)
-{
-    size_t n = (128 * 1024) / sizeof(FixedVector<T,N>);
-
-    thrust::host_vector< FixedVector<T,N> > h_keys(n);
-    thrust::host_vector< FixedVector<T,N> > h_vals(n);
-
-    for(size_t i = 0; i < n; i++)
-    {
-        h_keys[i] = FixedVector<T,N>(rand());
-        h_vals[i] = FixedVector<T,N>(i);
-    }
-
-    thrust::device_vector< FixedVector<T,N> > d_keys = h_keys;
-    thrust::device_vector< FixedVector<T,N> > d_vals = h_vals;
-    
-    thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin());
-    thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin());
-
-    ASSERT_EQUAL_QUIET(h_keys, d_keys);
-    ASSERT_EQUAL_QUIET(h_vals, d_vals);
-}
-
-void TestStableSortByKeyWithLargeKeysAndValues(void)
-{
-    _TestStableSortByKeyWithLargeKeysAndValues<int,    4>();
-    _TestStableSortByKeyWithLargeKeysAndValues<int,    8>();
-    _TestStableSortByKeyWithLargeKeysAndValues<int,   16>();
-
-// XXX these take too long to compile
-//    _TestStableSortByKeyWithLargeKeysAndValues<int,   32>();
-//    _TestStableSortByKeyWithLargeKeysAndValues<int,   64>();
-//    _TestStableSortByKeyWithLargeKeysAndValues<int,  128>();
-//    _TestStableSortByKeyWithLargeKeysAndValues<int,  256>();
-//    _TestStableSortByKeyWithLargeKeysAndValues<int,  512>();
-//    _TestStableSortByKeyWithLargeKeysAndValues<int, 1024>();
-//    _TestStableSortByKeyWithLargeKeysAndValues<int, 2048>();
-//    _TestStableSortByKeyWithLargeKeysAndValues<int, 4096>();
-//    _TestStableSortByKeyWithLargeKeysAndValues<int, 8192>();
-}
-DECLARE_UNITTEST(TestStableSortByKeyWithLargeKeysAndValues);
-
diff --git a/testing/stable_sort_by_key_large_keys.cu b/testing/stable_sort_by_key_large_keys.cu
new file mode 100644
index 000000000..9ea4d51f8
--- /dev/null
+++ b/testing/stable_sort_by_key_large_keys.cu
@@ -0,0 +1,38 @@
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+
+#include <unittest/unittest.h>
+
+template <unsigned int N>
+void _TestStableSortByKeyWithLargeKeys(void)
+{
+  size_t n = (128 * 1024) / sizeof(FixedVector<int, N>);
+
+  thrust::host_vector<FixedVector<int, N>> h_keys(n);
+  thrust::host_vector<unsigned int> h_vals(n);
+
+  for (size_t i = 0; i < n; i++)
+  {
+    const auto uint_i   = static_cast<unsigned int>(i);
+    const auto rand_int = unittest::generate_random_integer<int>()(uint_i);
+    h_keys[i]           = FixedVector<int, N>(rand_int);
+    h_vals[i]           = uint_i;
+  }
+
+  thrust::device_vector<FixedVector<int, N>> d_keys = h_keys;
+  thrust::device_vector<unsigned int> d_vals        = h_vals;
+
+  thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin());
+  thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin());
+
+  ASSERT_EQUAL_QUIET(h_keys, d_keys);
+  ASSERT_EQUAL_QUIET(h_vals, d_vals);
+}
+
+void TestStableSortByKeyWithLargeKeys(void)
+{
+  _TestStableSortByKeyWithLargeKeys<4>();
+  _TestStableSortByKeyWithLargeKeys<8>();
+  _TestStableSortByKeyWithLargeKeys<16>();
+}
+DECLARE_UNITTEST(TestStableSortByKeyWithLargeKeys);
diff --git a/testing/stable_sort_by_key_large_keys_and_values.cu b/testing/stable_sort_by_key_large_keys_and_values.cu
new file mode 100644
index 000000000..eed6b6efa
--- /dev/null
+++ b/testing/stable_sort_by_key_large_keys_and_values.cu
@@ -0,0 +1,38 @@
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+
+#include <unittest/unittest.h>
+
+template <unsigned int N>
+void _TestStableSortByKeyWithLargeKeysAndValues()
+{
+  size_t n = (128 * 1024) / sizeof(FixedVector<int, N>);
+
+  thrust::host_vector<FixedVector<int, N>> h_keys(n);
+  thrust::host_vector<FixedVector<int, N>> h_vals(n);
+
+  for (size_t i = 0; i < n; i++)
+  {
+    const auto uint_i   = static_cast<unsigned int>(i);
+    const auto rand_int = unittest::generate_random_integer<int>()(uint_i);
+    h_keys[i]           = FixedVector<int, N>(rand_int);
+    h_vals[i]           = FixedVector<int, N>(static_cast<int>(i));
+  }
+
+  thrust::device_vector<FixedVector<int, N>> d_keys = h_keys;
+  thrust::device_vector<FixedVector<int, N>> d_vals = h_vals;
+
+  thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin());
+  thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin());
+
+  ASSERT_EQUAL_QUIET(h_keys, d_keys);
+  ASSERT_EQUAL_QUIET(h_vals, d_vals);
+}
+
+void TestStableSortByKeyWithLargeKeysAndValues()
+{
+  _TestStableSortByKeyWithLargeKeysAndValues<4>();
+  _TestStableSortByKeyWithLargeKeysAndValues<8>();
+  _TestStableSortByKeyWithLargeKeysAndValues<16>();
+}
+DECLARE_UNITTEST(TestStableSortByKeyWithLargeKeysAndValues);
diff --git a/testing/stable_sort_by_key_large_values.cu b/testing/stable_sort_by_key_large_values.cu
new file mode 100644
index 000000000..b37753973
--- /dev/null
+++ b/testing/stable_sort_by_key_large_values.cu
@@ -0,0 +1,60 @@
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+
+#include <unittest/unittest.h>
+
+template <typename T>
+struct greater_div_10
+{
+  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const
+  {
+    return ((int)lhs) / 10 > ((int)rhs) / 10;
+  }
+};
+
+template <unsigned int N>
+void _TestStableSortByKeyWithLargeValues()
+{
+  size_t n = (128 * 1024) / sizeof(FixedVector<int, N>);
+
+  thrust::host_vector<unsigned int> h_keys(n);
+  thrust::host_vector<FixedVector<int, N>> h_vals(n);
+
+  for (size_t i = 0; i < n; i++)
+  {
+    const auto uint_i   = static_cast<unsigned int>(i);
+    const auto rand_int = unittest::generate_random_integer<unsigned int>()(uint_i);
+    h_keys[i]           = rand_int;
+    h_vals[i]           = FixedVector<int, N>(static_cast<int>(i));
+  }
+
+  thrust::device_vector<unsigned int> d_keys        = h_keys;
+  thrust::device_vector<FixedVector<int, N>> d_vals = h_vals;
+
+  thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin());
+  thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin());
+
+  ASSERT_EQUAL_QUIET(h_keys, d_keys);
+  ASSERT_EQUAL_QUIET(h_vals, d_vals);
+
+  // so cuda::stable_merge_sort_by_key() is called
+  thrust::stable_sort_by_key(h_keys.begin(),
+                             h_keys.end(),
+                             h_vals.begin(),
+                             greater_div_10<unsigned int>());
+  thrust::stable_sort_by_key(d_keys.begin(),
+                             d_keys.end(),
+                             d_vals.begin(),
+                             greater_div_10<unsigned int>());
+
+  ASSERT_EQUAL_QUIET(h_keys, d_keys);
+  ASSERT_EQUAL_QUIET(h_vals, d_vals);
+}
+
+void TestStableSortByKeyWithLargeValues()
+{
+  _TestStableSortByKeyWithLargeValues<4>();
+  _TestStableSortByKeyWithLargeValues<8>();
+  _TestStableSortByKeyWithLargeValues<16>();
+}
+DECLARE_UNITTEST(TestStableSortByKeyWithLargeValues);
diff --git a/testing/stable_sort_large.cu b/testing/stable_sort_large.cu
index 6b6b78b88..2b1907cea 100644
--- a/testing/stable_sort_large.cu
+++ b/testing/stable_sort_large.cu
@@ -24,22 +24,9 @@ void _TestStableSortWithLargeKeys(void)
 
 void TestStableSortWithLargeKeys(void)
 {
-    _TestStableSortWithLargeKeys<int,    1>();
     _TestStableSortWithLargeKeys<int,    2>();
-    _TestStableSortWithLargeKeys<int,    4>();
-    _TestStableSortWithLargeKeys<int,    8>();
-    _TestStableSortWithLargeKeys<int,   16>();
-    _TestStableSortWithLargeKeys<int,   32>();
-    _TestStableSortWithLargeKeys<int,   64>();
+    _TestStableSortWithLargeKeys<int,   17>();
     _TestStableSortWithLargeKeys<int,  128>();
-    _TestStableSortWithLargeKeys<int,  256>();
-
-// XXX these take too long to compile
-//    _TestStableSortWithLargeKeys<int,  512>();
-//    _TestStableSortWithLargeKeys<int, 1024>();
-//    _TestStableSortWithLargeKeys<int, 2048>();
-//    _TestStableSortWithLargeKeys<int, 4096>();
-//    _TestStableSortWithLargeKeys<int, 8192>();
 }
 DECLARE_UNITTEST(TestStableSortWithLargeKeys);
 
diff --git a/testing/swap_ranges.cu b/testing/swap_ranges.cu
index a2d061fe3..843c66240 100644
--- a/testing/swap_ranges.cu
+++ b/testing/swap_ranges.cu
@@ -1,6 +1,6 @@
 #include <unittest/unittest.h>
 #include <thrust/swap.h>
-#include <thrust/iterator/iterator_traits.h> 
+#include <thrust/iterator/iterator_traits.h>
 #include <thrust/iterator/retag.h>
 #include <thrust/system/cpp/memory.h>
 
@@ -68,7 +68,7 @@ void TestSwapRangesSimple(void)
     ASSERT_EQUAL(v1[2], 7);
     ASSERT_EQUAL(v1[3], 8);
     ASSERT_EQUAL(v1[4], 9);
-    
+
     ASSERT_EQUAL(v2[0], 0);
     ASSERT_EQUAL(v2[1], 1);
     ASSERT_EQUAL(v2[2], 2);
@@ -88,11 +88,11 @@ void TestSwapRanges(const size_t n)
     thrust::host_vector<T>    h2 = a2;
     thrust::device_vector<T>  d1 = a1;
     thrust::device_vector<T>  d2 = a2;
-  
+
     thrust::swap_ranges(h1.begin(), h1.end(), h2.begin());
     thrust::swap_ranges(d1.begin(), d1.end(), d2.begin());
 
-    ASSERT_EQUAL(h1, a2);  
+    ASSERT_EQUAL(h1, a2);
     ASSERT_EQUAL(d1, a2);
     ASSERT_EQUAL(h2, a1);
     ASSERT_EQUAL(d2, a1);
@@ -147,6 +147,10 @@ struct type_with_swap
     return m_x == other.m_x && m_swapped == other.m_swapped;
   }
 
+#if THRUST_CPP_DIALECT >= 2011
+  type_with_swap & operator=(const type_with_swap &) = default;
+#endif
+
   int m_x;
   bool m_swapped;
 };
diff --git a/testing/transform_input_output_iterator.cu b/testing/transform_input_output_iterator.cu
new file mode 100644
index 000000000..7df163077
--- /dev/null
+++ b/testing/transform_input_output_iterator.cu
@@ -0,0 +1,122 @@
+#include <unittest/unittest.h>
+#include <thrust/iterator/transform_input_output_iterator.h>
+
+#include <thrust/copy.h>
+#include <thrust/reduce.h>
+#include <thrust/functional.h>
+#include <thrust/sequence.h>
+#include <thrust/iterator/counting_iterator.h>
+
+template <class Vector>
+void TestTransformInputOutputIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    typedef thrust::negate<T> InputFunction;
+    typedef thrust::square<T> OutputFunction;
+    typedef typename Vector::iterator Iterator;
+
+    Vector input(4);
+    Vector squared(4);
+    Vector negated(4);
+    
+    // initialize input
+    thrust::sequence(input.begin(), input.end(), 1);
+   
+    // construct transform_iterator
+    thrust::transform_input_output_iterator<InputFunction, OutputFunction, Iterator>
+        transform_iter(squared.begin(), InputFunction(), OutputFunction());
+
+    // transform_iter writes squared value
+    thrust::copy(input.begin(), input.end(), transform_iter);
+
+    Vector gold_squared(4);
+    gold_squared[0] = 1;
+    gold_squared[1] = 4;
+    gold_squared[2] = 9;
+    gold_squared[3] = 16;
+
+    ASSERT_EQUAL(squared, gold_squared);
+
+    // negated value read from transform_iter
+    thrust::copy_n(transform_iter, squared.size(), negated.begin());
+
+    Vector gold_negated(4);
+    gold_negated[0] = -1;
+    gold_negated[1] = -4;
+    gold_negated[2] = -9;
+    gold_negated[3] = -16;
+
+    ASSERT_EQUAL(negated, gold_negated);
+
+}
+DECLARE_VECTOR_UNITTEST(TestTransformInputOutputIterator);
+
+template <class Vector>
+void TestMakeTransformInputOutputIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    typedef thrust::negate<T> InputFunction;
+    typedef thrust::square<T> OutputFunction;
+
+    Vector input(4);
+    Vector negated(4);
+    Vector squared(4);
+    
+    // initialize input
+    thrust::sequence(input.begin(), input.end(), 1);
+
+    // negated value read from transform iterator
+    thrust::copy_n(thrust::make_transform_input_output_iterator(input.begin(), InputFunction(), OutputFunction()),
+                   input.size(), negated.begin());
+
+    Vector gold_negated(4);
+    gold_negated[0] = -1;
+    gold_negated[1] = -2;
+    gold_negated[2] = -3;
+    gold_negated[3] = -4;
+
+    ASSERT_EQUAL(negated, gold_negated);
+
+    // squared value writen by transform iterator
+    thrust::copy(negated.begin(), negated.end(),
+                 thrust::make_transform_input_output_iterator(squared.begin(), InputFunction(), OutputFunction()));
+
+    Vector gold_squared(4);
+    gold_squared[0] = 1;
+    gold_squared[1] = 4;
+    gold_squared[2] = 9;
+    gold_squared[3] = 16;
+
+    ASSERT_EQUAL(squared, gold_squared);
+
+}
+DECLARE_VECTOR_UNITTEST(TestMakeTransformInputOutputIterator);
+
+template <typename T>
+struct TestTransformInputOutputIteratorScan
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+
+        thrust::host_vector<T>   h_result(n);
+        thrust::device_vector<T> d_result(n);
+
+        // run on host (uses forward iterator negate)
+        thrust::inclusive_scan(thrust::make_transform_input_output_iterator(h_data.begin(), thrust::negate<T>(), thrust::identity<T>()),
+                               thrust::make_transform_input_output_iterator(h_data.end(),   thrust::negate<T>(), thrust::identity<T>()),
+                               h_result.begin());
+        // run on device (uses reverse iterator negate)
+        thrust::inclusive_scan(d_data.begin(), d_data.end(),
+                               thrust::make_transform_input_output_iterator(
+                                   d_result.begin(), thrust::square<T>(), thrust::negate<T>()));
+
+
+        ASSERT_EQUAL(h_result, d_result);
+    }
+};
+VariableUnitTest<TestTransformInputOutputIteratorScan, IntegralTypes> TestTransformInputOutputIteratorScanInstance;
+
diff --git a/testing/transform_iterator.cu b/testing/transform_iterator.cu
index e28e333e1..a960a0b44 100644
--- a/testing/transform_iterator.cu
+++ b/testing/transform_iterator.cu
@@ -7,6 +7,8 @@
 #include <thrust/sequence.h>
 #include <thrust/iterator/counting_iterator.h>
 
+#include <memory>
+
 template <class Vector>
 void TestTransformIterator(void)
 {
@@ -84,3 +86,28 @@ struct TestTransformIteratorReduce
 };
 VariableUnitTest<TestTransformIteratorReduce, IntegralTypes> TestTransformIteratorReduceInstance;
 
+
+struct ExtractValue{
+    int operator()(std::unique_ptr<int> const& n){
+        return *n;
+    }
+};
+
+void TestTransformIteratorNonCopyable(){
+
+    thrust::host_vector<std::unique_ptr<int>> hv(4);
+    hv[0].reset(new int{1});
+    hv[1].reset(new int{2});
+    hv[2].reset(new int{3});
+    hv[3].reset(new int{4});
+
+    auto transformed = thrust::make_transform_iterator(hv.begin(), ExtractValue{});
+    ASSERT_EQUAL(transformed[0], 1);
+    ASSERT_EQUAL(transformed[1], 2);
+    ASSERT_EQUAL(transformed[2], 3);
+    ASSERT_EQUAL(transformed[3], 4);
+
+}
+
+DECLARE_UNITTEST(TestTransformIteratorNonCopyable);
+
diff --git a/testing/transform_output_iterator.cu b/testing/transform_output_iterator.cu
index cdeb950f1..27f8b53bd 100644
--- a/testing/transform_output_iterator.cu
+++ b/testing/transform_output_iterator.cu
@@ -1,25 +1,27 @@
 #include <unittest/unittest.h>
-#include <thrust/iterator/transform_output_iterator.h>
 
 #include <thrust/copy.h>
-#include <thrust/reduce.h>
+#include <thrust/device_vector.h>
 #include <thrust/functional.h>
-#include <thrust/sequence.h>
+#include <thrust/host_vector.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/reduce.h>
+#include <thrust/sequence.h>
 
 template <class Vector>
 void TestTransformOutputIterator(void)
 {
     typedef typename Vector::value_type T;
 
-    typedef thrust::negate<T> UnaryFunction;
+    typedef thrust::square<T> UnaryFunction;
     typedef typename Vector::iterator Iterator;
 
     Vector input(4);
     Vector output(4);
     
     // initialize input
-    thrust::sequence(input.begin(), input.end(), 1);
+    thrust::sequence(input.begin(), input.end(), T{1});
    
     // construct transform_iterator
     thrust::transform_output_iterator<UnaryFunction, Iterator> output_iter(output.begin(), UnaryFunction());
@@ -27,10 +29,10 @@ void TestTransformOutputIterator(void)
     thrust::copy(input.begin(), input.end(), output_iter);
 
     Vector gold_output(4);
-    gold_output[0] = -1;
-    gold_output[1] = -2;
-    gold_output[2] = -3;
-    gold_output[3] = -4;
+    gold_output[0] = 1;
+    gold_output[1] = 4;
+    gold_output[2] = 9;
+    gold_output[3] = 16;
 
     ASSERT_EQUAL(output, gold_output);
 
@@ -42,7 +44,7 @@ void TestMakeTransformOutputIterator(void)
 {
     typedef typename Vector::value_type T;
 
-    typedef thrust::negate<T> UnaryFunction;
+    typedef thrust::square<T> UnaryFunction;
 
     Vector input(4);
     Vector output(4);
@@ -54,11 +56,10 @@ void TestMakeTransformOutputIterator(void)
                  thrust::make_transform_output_iterator(output.begin(), UnaryFunction()));
 
     Vector gold_output(4);
-    gold_output[0] = -1;
-    gold_output[1] = -2;
-    gold_output[2] = -3;
-    gold_output[3] = -4;
-
+    gold_output[0] = 1;
+    gold_output[1] = 4;
+    gold_output[2] = 9;
+    gold_output[3] = 16;
     ASSERT_EQUAL(output, gold_output);
 
 }
@@ -88,5 +89,5 @@ struct TestTransformOutputIteratorScan
         ASSERT_EQUAL(h_result, d_result);
     }
 };
-VariableUnitTest<TestTransformOutputIteratorScan, IntegralTypes> TestTransformOutputIteratorScanInstance;
+VariableUnitTest<TestTransformOutputIteratorScan, SignedIntegralTypes> TestTransformOutputIteratorScanInstance;
 
diff --git a/testing/transform_output_iterator_reduce_by_key.cu b/testing/transform_output_iterator_reduce_by_key.cu
new file mode 100644
index 000000000..f7004f8c7
--- /dev/null
+++ b/testing/transform_output_iterator_reduce_by_key.cu
@@ -0,0 +1,51 @@
+#include <unittest/unittest.h>
+
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+#include <thrust/host_vector.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/reduce.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+
+
+template <typename T>
+struct TestTransformOutputIteratorReduceByKey
+{
+  void operator()(const size_t n)
+  {
+    thrust::host_vector<T> h_keys = unittest::random_samples<T>(n);
+    thrust::sort(h_keys.begin(), h_keys.end());
+    thrust::device_vector<T> d_keys = h_keys;
+
+    thrust::host_vector<T> h_values   = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_values = h_values;
+
+    thrust::host_vector<T> h_result(n);
+    thrust::device_vector<T> d_result(n);
+
+    // run on host
+    thrust::reduce_by_key(thrust::host,
+                          h_keys.begin(),
+                          h_keys.end(),
+                          thrust::make_transform_iterator(h_values.begin(), thrust::negate<T>()),
+                          thrust::discard_iterator<T>{},
+                          h_result.begin());
+    // run on device
+    thrust::reduce_by_key(thrust::device,
+                          d_keys.begin(),
+                          d_keys.end(),
+                          d_values.begin(),
+                          thrust::discard_iterator<T>{},
+                          thrust::make_transform_output_iterator(d_result.begin(),
+                                                                 thrust::negate<T>()));
+
+    ASSERT_EQUAL(h_result, d_result);
+  }
+};
+VariableUnitTest<TestTransformOutputIteratorReduceByKey, SignedIntegralTypes>
+  TestTransformOutputIteratorReduceByKeyInstance;
+
diff --git a/testing/transform_scan.cu b/testing/transform_scan.cu
index 2e6633923..2b6e35a2a 100644
--- a/testing/transform_scan.cu
+++ b/testing/transform_scan.cu
@@ -190,6 +190,61 @@ void TestTransformScanSimple(void)
 }
 DECLARE_INTEGRAL_VECTOR_UNITTEST(TestTransformScanSimple);
 
+struct Record {
+    int number;
+
+    bool operator==(const Record& rhs) const {
+        return number == rhs.number;
+    }
+    bool operator!=(const Record& rhs) const {
+        return !(rhs == *this);
+    }
+    friend Record operator+(Record lhs, const Record& rhs) {
+        lhs.number += rhs.number;
+        return lhs;
+    }
+    friend std::ostream& operator<<(std::ostream& os, const Record& record) {
+        os << "number: " << record.number;
+        return os;
+    }
+};
+
+struct negate {
+    __host__ __device__ int operator()(Record const& record) const
+    {
+        return - record.number;
+    }
+};
+
+void TestTransformInclusiveScanDifferentTypes()
+{
+    typename thrust::host_vector<int>::iterator h_iter;
+
+    thrust::host_vector<Record> h_input(5);
+    thrust::host_vector<int> h_output(5);
+    thrust::host_vector<int> result(5);
+
+    h_input[0] = {1}; h_input[1] = {3}; h_input[2] = {-2}; h_input[3] = {4}; h_input[4] = {-5};
+
+    thrust::host_vector<Record> input_copy(h_input);
+
+    h_iter = thrust::transform_inclusive_scan(h_input.begin(), h_input.end(), h_output.begin(), negate{}, thrust::plus<int>{});
+    result[0] = -1; result[1] = -4; result[2] = -2; result[3] = -6; result[4] = -1;
+    ASSERT_EQUAL(std::size_t(h_iter - h_output.begin()), h_input.size());
+    ASSERT_EQUAL(h_input, input_copy);
+    ASSERT_EQUAL(h_output, result);
+
+    typename thrust::device_vector<int>::iterator d_iter;
+
+    thrust::device_vector<Record> d_input = h_input;
+    thrust::device_vector<int> d_output(5);
+
+    d_iter = thrust::transform_inclusive_scan(d_input.begin(), d_input.end(), d_output.begin(), negate{}, thrust::plus<int>{});
+    ASSERT_EQUAL(std::size_t(d_iter - d_output.begin()), d_input.size());
+    ASSERT_EQUAL(d_input, input_copy);
+    ASSERT_EQUAL(d_output, result);
+}
+DECLARE_UNITTEST(TestTransformInclusiveScanDifferentTypes);
 
 template <typename T>
 struct TestTransformScan
@@ -292,3 +347,55 @@ struct TestTransformScanToDiscardIterator
 };
 VariableUnitTest<TestTransformScanToDiscardIterator, IntegralTypes> TestTransformScanToDiscardIteratorInstance;
 
+// Regression test for https://github.com/NVIDIA/thrust/issues/1332
+// The issue was the internal transform_input_iterator_t created by the
+// transform_inclusive_scan implementation was instantiated using a reference
+// type for the value_type.
+template <typename T>
+void TestValueCategoryDeduction()
+{
+    thrust::device_vector<T> vec;
+
+    T a_h[10] = {5, 0, 5, 8, 6, 7, 5, 3, 0, 9};
+    vec.assign((T*)a_h, a_h + 10);
+
+
+    thrust::transform_inclusive_scan(thrust::device,
+                                     vec.cbegin(),
+                                     vec.cend(),
+                                     vec.begin(),
+                                     thrust::identity<>{},
+                                     thrust::maximum<>{});
+
+    ASSERT_EQUAL(T{5}, vec[0]);
+    ASSERT_EQUAL(T{5}, vec[1]);
+    ASSERT_EQUAL(T{5}, vec[2]);
+    ASSERT_EQUAL(T{8}, vec[3]);
+    ASSERT_EQUAL(T{8}, vec[4]);
+    ASSERT_EQUAL(T{8}, vec[5]);
+    ASSERT_EQUAL(T{8}, vec[6]);
+    ASSERT_EQUAL(T{8}, vec[7]);
+    ASSERT_EQUAL(T{8}, vec[8]);
+    ASSERT_EQUAL(T{9}, vec[9]);
+
+    vec.assign((T*)a_h, a_h + 10);
+    thrust::transform_exclusive_scan(thrust::device,
+                                     vec.cbegin(),
+                                     vec.cend(),
+                                     vec.begin(),
+                                     thrust::identity<>{},
+                                     T{},
+                                     thrust::maximum<>{});
+
+    ASSERT_EQUAL(T{0}, vec[0]);
+    ASSERT_EQUAL(T{5}, vec[1]);
+    ASSERT_EQUAL(T{5}, vec[2]);
+    ASSERT_EQUAL(T{5}, vec[3]);
+    ASSERT_EQUAL(T{8}, vec[4]);
+    ASSERT_EQUAL(T{8}, vec[5]);
+    ASSERT_EQUAL(T{8}, vec[6]);
+    ASSERT_EQUAL(T{8}, vec[7]);
+    ASSERT_EQUAL(T{8}, vec[8]);
+    ASSERT_EQUAL(T{8}, vec[9]);
+}
+DECLARE_GENERIC_UNITTEST(TestValueCategoryDeduction);
diff --git a/testing/tuple_algorithms.cu b/testing/tuple_algorithms.cu
index 1a7b48dec..449fdc2f1 100644
--- a/testing/tuple_algorithms.cu
+++ b/testing/tuple_algorithms.cu
@@ -5,26 +5,58 @@
 #include <unittest/unittest.h>
 
 #include <thrust/detail/tuple_algorithms.h>
+#include <thrust/type_traits/integer_sequence.h>
 
 // FIXME: Replace with C++14 style `thrust::square<>` when we have it.
 struct custom_square
 {
   template <typename T>
+  __host__ __device__
   T operator()(T v) const
   {
-    return v * v; 
+    return v * v;
   }
 };
 
+struct custom_square_inplace
+{
+  template <typename T>
+  __host__ __device__
+  void operator()(T& v) const
+  {
+    v *= v;
+  }
+};
+
+void test_tuple_subset()
+{
+  auto t0 = std::make_tuple(0, 2, 3.14);
+
+  auto t1 = thrust::tuple_subset(t0, thrust::index_sequence<2, 0>{});
+
+  ASSERT_EQUAL_QUIET(t1, std::make_tuple(3.14, 0));
+}
+DECLARE_UNITTEST(test_tuple_subset);
+
 void test_tuple_transform()
 {
   auto t0 = std::make_tuple(0, 2, 3.14);
 
-  auto t1 = thrust::tuple_transform(t0, custom_square{}); 
+  auto t1 = thrust::tuple_transform(t0, custom_square{});
 
   ASSERT_EQUAL_QUIET(t1, std::make_tuple(0, 4, 9.8596));
 }
 DECLARE_UNITTEST(test_tuple_transform);
- 
+
+void test_tuple_for_each()
+{
+  auto t = std::make_tuple(0, 2, 3.14);
+
+  thrust::tuple_for_each(t, custom_square_inplace{});
+
+  ASSERT_EQUAL_QUIET(t, std::make_tuple(0, 4, 9.8596));
+}
+DECLARE_UNITTEST(test_tuple_for_each);
+
 #endif // THRUST_CPP_DIALECT >= 2011
 
diff --git a/testing/tuple_scan.cu b/testing/tuple_scan.cu
index c15b81751..d0565d6d4 100644
--- a/testing/tuple_scan.cu
+++ b/testing/tuple_scan.cu
@@ -58,18 +58,6 @@ struct TestTupleScan
      inclusive_scan(d_input.begin(), d_input.end(), d_output.begin(), SumTupleFunctor());
      ASSERT_EQUAL_QUIET(h_output, d_output);
 
-    // The tests below get miscompiled on Tesla hw for 8b types
-
-#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-    if(const CUDATestDriver *driver = dynamic_cast<const CUDATestDriver*>(&UnitTestDriver::s_driver()))
-    {
-      if(sizeof(T) == sizeof(unittest::uint8_t) && driver->current_device_architecture() < 200)
-      {
-        KNOWN_FAILURE;
-      } // end if
-    } // end if
-#endif
-
      // exclusive_scan
      tuple<T,T> init(13,17);
      exclusive_scan(h_input.begin(), h_input.end(), h_output.begin(), init, SumTupleFunctor());
diff --git a/testing/uninitialized_copy.cu b/testing/uninitialized_copy.cu
index 7455d8c81..62a79cdc9 100644
--- a/testing/uninitialized_copy.cu
+++ b/testing/uninitialized_copy.cu
@@ -3,6 +3,7 @@
 #include <thrust/device_malloc_allocator.h>
 #include <thrust/iterator/retag.h>
 
+#include <nv/target>
 
 template<typename InputIterator, typename ForwardIterator>
 ForwardIterator uninitialized_copy(my_system &system,
@@ -147,13 +148,13 @@ struct CopyConstructTest
   __host__ __device__
   CopyConstructTest(const CopyConstructTest &)
   {
-#if __CUDA_ARCH__
-    copy_constructed_on_device = true;
-    copy_constructed_on_host   = false;
-#else
-    copy_constructed_on_device = false;
-    copy_constructed_on_device = true;
-#endif
+    NV_IF_TARGET(NV_IS_DEVICE, (
+      copy_constructed_on_device = true;
+      copy_constructed_on_host   = false;
+    ), (
+      copy_constructed_on_device = false;
+      copy_constructed_on_host = true;
+    ));
   }
 
   __host__ __device__
diff --git a/testing/uninitialized_fill.cu b/testing/uninitialized_fill.cu
index 5e0d53c72..8fbb97002 100644
--- a/testing/uninitialized_fill.cu
+++ b/testing/uninitialized_fill.cu
@@ -3,6 +3,7 @@
 #include <thrust/device_malloc_allocator.h>
 #include <thrust/iterator/retag.h>
 
+#include <nv/target>
 
 template<typename ForwardIterator, typename T>
 void uninitialized_fill(my_system &system,
@@ -147,6 +148,7 @@ DECLARE_VECTOR_UNITTEST(TestUninitializedFillPOD);
 
 struct CopyConstructTest
 {
+  __host__ __device__
   CopyConstructTest(void)
     :copy_constructed_on_host(false),
      copy_constructed_on_device(false)
@@ -155,13 +157,13 @@ struct CopyConstructTest
   __host__ __device__
   CopyConstructTest(const CopyConstructTest &)
   {
-#if __CUDA_ARCH__
-    copy_constructed_on_device = true;
-    copy_constructed_on_host   = false;
-#else
-    copy_constructed_on_device = false;
-    copy_constructed_on_host   = true;
-#endif
+    NV_IF_TARGET(NV_IS_DEVICE, (
+      copy_constructed_on_device = true;
+      copy_constructed_on_host   = false;
+    ), (
+      copy_constructed_on_device = false;
+      copy_constructed_on_host   = true;
+    ));
   }
 
   __host__ __device__
diff --git a/testing/unique.cu b/testing/unique.cu
index 8073832df..7df2def87 100644
--- a/testing/unique.cu
+++ b/testing/unique.cu
@@ -95,6 +95,50 @@ void TestUniqueCopyDispatchImplicit()
 DECLARE_UNITTEST(TestUniqueCopyDispatchImplicit);
 
 
+template <typename ForwardIterator>
+typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(my_system &system,
+                 ForwardIterator,
+                 ForwardIterator)
+{
+    system.validate_dispatch();
+    return 0;
+}
+
+void TestUniqueCountDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::unique_count(sys, vec.begin(), vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestUniqueCountDispatchExplicit);
+
+
+template <typename ForwardIterator>
+typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(my_tag,
+                 ForwardIterator,
+                 ForwardIterator)
+{
+    return 13;
+}
+
+void TestUniqueCountDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    auto result = thrust::unique_count(
+        thrust::retag<my_tag>(vec.begin()),
+        thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, result);
+}
+DECLARE_UNITTEST(TestUniqueCountDispatchImplicit);
+
+
 template<typename T>
 struct is_equal_div_10_unique
 {
@@ -266,3 +310,48 @@ struct TestUniqueCopyToDiscardIterator
 VariableUnitTest<TestUniqueCopyToDiscardIterator, IntegralTypes> TestUniqueCopyToDiscardIteratorInstance;
 
 
+template <typename Vector>
+void TestUniqueCountSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(10);
+    data[0] = 11;
+    data[1] = 11;
+    data[2] = 12;
+    data[3] = 20;
+    data[4] = 29;
+    data[5] = 21;
+    data[6] = 21;
+    data[7] = 31;
+    data[8] = 31;
+    data[9] = 37;
+
+    int count = thrust::unique_count(data.begin(), data.end());
+
+    ASSERT_EQUAL(count, 7);
+
+    int div_10_count = thrust::unique_count(data.begin(), data.end(), is_equal_div_10_unique<T>());
+
+    ASSERT_EQUAL(div_10_count, 3);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestUniqueCountSimple);
+
+template <typename T>
+struct TestUniqueCount
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T> h_data = unittest::random_integers<bool>(n);
+        thrust::device_vector<T> d_data = h_data;
+
+        int h_count{};
+        int d_count{};
+
+        h_count = thrust::unique_count(h_data.begin(), h_data.end());
+        d_count = thrust::unique_count(d_data.begin(), d_data.end());
+
+        ASSERT_EQUAL(h_count, d_count);
+    }
+};
+VariableUnitTest<TestUniqueCount, IntegralTypes> TestUniqueCountInstance;
diff --git a/testing/unittest/CMakeLists.txt b/testing/unittest/CMakeLists.txt
new file mode 100644
index 000000000..4c0eb66cb
--- /dev/null
+++ b/testing/unittest/CMakeLists.txt
@@ -0,0 +1,24 @@
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+
+  set(framework_target ${config_prefix}.test.framework)
+
+  if ("CUDA" STREQUAL "${config_device}")
+    set(framework_srcs
+      testframework.cu
+      cuda/testframework.cu
+    )
+  else()
+    # Wrap the cu file inside a .cpp file for non-CUDA builds
+    thrust_wrap_cu_in_cpp(framework_srcs testframework.cu ${thrust_target})
+  endif()
+
+  add_library(${framework_target} STATIC ${framework_srcs})
+  target_link_libraries(${framework_target} PUBLIC ${thrust_target})
+  target_include_directories(${framework_target} PRIVATE "${Thrust_SOURCE_DIR}/testing")
+  thrust_clone_target_properties(${framework_target} ${thrust_target})
+
+  thrust_fix_clang_nvcc_build_for(${framework_target})
+
+endforeach()
diff --git a/testing/unittest/assertions.h b/testing/unittest/assertions.h
index 6803e8168..855d705a4 100644
--- a/testing/unittest/assertions.h
+++ b/testing/unittest/assertions.h
@@ -3,6 +3,7 @@
 #include <thrust/complex.h>
 #include <thrust/host_vector.h>
 #include <thrust/device_vector.h>
+#include <thrust/universal_vector.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/type_traits.h>
 
@@ -98,15 +99,15 @@ double const DEFAULT_ABSOLUTE_TOL = 1e-4;
 template<typename T>
   struct value_type
 {
-  typedef typename thrust::detail::remove_const<
-    typename thrust::detail::remove_reference<
+  typedef typename THRUST_NS_QUALIFIER::detail::remove_const<
+    typename THRUST_NS_QUALIFIER::detail::remove_reference<
       T
     >::type
   >::type type;
 };
 
 template<typename T>
-  struct value_type< thrust::device_reference<T> >
+  struct value_type< THRUST_NS_QUALIFIER::device_reference<T> >
 {
   typedef typename value_type<T>::type type;
 };
@@ -327,7 +328,7 @@ void assert_almost_equal(T1 a, T2 b,
 
 
 template <typename T1, typename T2>
-void assert_almost_equal(thrust::complex<T1> a, thrust::complex<T2> b,
+void assert_almost_equal(THRUST_NS_QUALIFIER::complex<T1> a, THRUST_NS_QUALIFIER::complex<T2> b,
                          const std::string& filename = "unknown", int lineno = -1,
                          double a_tol = DEFAULT_ABSOLUTE_TOL, double r_tol = DEFAULT_RELATIVE_TOL)
 
@@ -343,7 +344,7 @@ void assert_almost_equal(thrust::complex<T1> a, thrust::complex<T2> b,
 
 
 template <typename T1, typename T2>
-  void assert_almost_equal(const thrust::complex<T1>& a, const std::complex<T2>& b,
+  void assert_almost_equal(const THRUST_NS_QUALIFIER::complex<T1>& a, const std::complex<T2>& b,
                          const std::string& filename = "unknown", int lineno = -1,
                          double a_tol = DEFAULT_ABSOLUTE_TOL, double r_tol = DEFAULT_RELATIVE_TOL)
 
@@ -370,13 +371,13 @@ class almost_equal_to
 
 
 template <typename T>
-class almost_equal_to<thrust::complex<T> >
+class almost_equal_to<THRUST_NS_QUALIFIER::complex<T> >
 {
     public:
         double a_tol, r_tol;
         almost_equal_to(double _a_tol = DEFAULT_ABSOLUTE_TOL, double _r_tol = DEFAULT_RELATIVE_TOL) : a_tol(_a_tol), r_tol(_r_tol) {}
-        bool operator()(const thrust::complex<T>& a, const thrust::complex<T>& b) const {
-            return almost_equal((double) a.real(), (double) b.real(), a_tol, r_tol) 
+        bool operator()(const THRUST_NS_QUALIFIER::complex<T>& a, const THRUST_NS_QUALIFIER::complex<T>& b) const {
+            return almost_equal((double) a.real(), (double) b.real(), a_tol, r_tol)
                 && almost_equal((double) a.imag(), (double) b.imag(), a_tol, r_tol);
         }
 };
@@ -388,15 +389,15 @@ template <typename ForwardIterator1, typename ForwardIterator2, typename BinaryP
 void assert_equal(ForwardIterator1 first1, ForwardIterator1 last1, ForwardIterator2 first2, ForwardIterator2 last2, BinaryPredicate op,
                   const std::string& filename = "unknown", int lineno = -1)
 {
-    typedef typename thrust::iterator_difference<ForwardIterator1>::type difference_type;
-    typedef typename thrust::iterator_value<ForwardIterator1>::type InputType;
-    
+    typedef typename THRUST_NS_QUALIFIER::iterator_difference<ForwardIterator1>::type difference_type;
+    typedef typename THRUST_NS_QUALIFIER::iterator_value<ForwardIterator1>::type InputType;
+
     bool failure = false;
 
-    difference_type length1 = thrust::distance(first1, last1);
-    difference_type length2 = thrust::distance(first2, last2);
-    
-    difference_type min_length = thrust::min(length1, length2);
+    difference_type length1 = THRUST_NS_QUALIFIER::distance(first1, last1);
+    difference_type length2 = THRUST_NS_QUALIFIER::distance(first2, last2);
+
+    difference_type min_length = THRUST_NS_QUALIFIER::min(length1, length2);
 
     unittest::UnitTestFailure f;
     f << "[" << filename << ":" << lineno << "] ";
@@ -409,7 +410,7 @@ void assert_equal(ForwardIterator1 first1, ForwardIterator1 last1, ForwardIterat
     }
 
     // check values
-    
+
     size_t mismatches = 0;
 
     for (difference_type i = 0; i < min_length; i++)
@@ -427,10 +428,14 @@ void assert_equal(ForwardIterator1 first1, ForwardIterator1 last1, ForwardIterat
 
         if(mismatches <= MAX_OUTPUT_LINES)
         {
-          if (sizeof(InputType) == 1)
+          THRUST_IF_CONSTEXPR(sizeof(InputType) == 1)
+          {
             f << "  [" << i << "] " << *first1 + InputType() << "  " << *first2 + InputType() << "\n"; // unprintable chars are a problem
+          }
           else
+          {
             f << "  [" << i << "] " << *first1 << "  " << *first2 << "\n";
+          }
         }
       }
 
@@ -458,8 +463,8 @@ template <typename ForwardIterator1, typename ForwardIterator2>
 void assert_equal(ForwardIterator1 first1, ForwardIterator1 last1, ForwardIterator2 first2, ForwardIterator2 last2,
                   const std::string& filename = "unknown", int lineno = -1)
 {
-    typedef typename thrust::iterator_traits<ForwardIterator1>::value_type InputType;
-    assert_equal(first1, last1, first2, last2, thrust::equal_to<InputType>(), filename, lineno);
+    typedef typename THRUST_NS_QUALIFIER::iterator_traits<ForwardIterator1>::value_type InputType;
+    assert_equal(first1, last1, first2, last2, THRUST_NS_QUALIFIER::equal_to<InputType>(), filename, lineno);
 }
 
 
@@ -468,79 +473,190 @@ void assert_almost_equal(ForwardIterator1 first1, ForwardIterator1 last1, Forwar
                          const std::string& filename = "unknown", int lineno = -1,
                          const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
 {
-    typedef typename thrust::iterator_traits<ForwardIterator1>::value_type InputType;
+    typedef typename THRUST_NS_QUALIFIER::iterator_traits<ForwardIterator1>::value_type InputType;
     assert_equal(first1, last1, first2, last2, almost_equal_to<InputType>(a_tol, r_tol), filename, lineno);
 }
 
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_equal(const THRUST_NS_QUALIFIER::host_vector<T,Alloc1>& A,
+                  const THRUST_NS_QUALIFIER::host_vector<T,Alloc2>& B,
+                  const std::string& filename = "unknown", int lineno = -1)
+{
+    assert_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno);
+}
 
 template <typename T, typename Alloc1, typename Alloc2>
-void assert_equal(const thrust::host_vector<T,Alloc1>& A, const thrust::host_vector<T,Alloc2>& B,
+void assert_equal(const THRUST_NS_QUALIFIER::host_vector<T,Alloc1>& A,
+                  const THRUST_NS_QUALIFIER::device_vector<T,Alloc2>& B,
+                  const std::string& filename = "unknown", int lineno = -1)
+{
+    THRUST_NS_QUALIFIER::host_vector<T,Alloc1> B_host = B;
+    assert_equal(A, B_host, filename, lineno);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_equal(const THRUST_NS_QUALIFIER::device_vector<T,Alloc1>& A,
+                  const THRUST_NS_QUALIFIER::host_vector<T,Alloc2>& B,
+                  const std::string& filename = "unknown", int lineno = -1)
+{
+    THRUST_NS_QUALIFIER::host_vector<T,Alloc2> A_host = A;
+    assert_equal(A_host, B, filename, lineno);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_equal(const THRUST_NS_QUALIFIER::device_vector<T,Alloc1>& A,
+                  const THRUST_NS_QUALIFIER::device_vector<T,Alloc2>& B,
+                  const std::string& filename = "unknown", int lineno = -1)
+{
+    THRUST_NS_QUALIFIER::host_vector<T> A_host = A;
+    THRUST_NS_QUALIFIER::host_vector<T> B_host = B;
+    assert_equal(A_host, B_host, filename, lineno);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_equal(const THRUST_NS_QUALIFIER::universal_vector<T,Alloc1>& A,
+                  const THRUST_NS_QUALIFIER::universal_vector<T,Alloc2>& B,
                   const std::string& filename = "unknown", int lineno = -1)
 {
     assert_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno);
 }
 
 template <typename T, typename Alloc1, typename Alloc2>
-void assert_almost_equal(const thrust::host_vector<T,Alloc1>& A, const thrust::host_vector<T,Alloc2>& B,
-                         const std::string& filename = "unknown", int lineno = -1,
-                         const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
+void assert_equal(const THRUST_NS_QUALIFIER::host_vector<T,Alloc1>& A,
+                  const THRUST_NS_QUALIFIER::universal_vector<T,Alloc2>& B,
+                  const std::string& filename = "unknown", int lineno = -1)
 {
-    assert_almost_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno, a_tol, r_tol);
+    assert_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno);
 }
 
 template <typename T, typename Alloc1, typename Alloc2>
-void assert_equal(const thrust::host_vector<T,Alloc1>& A, const thrust::device_vector<T,Alloc2>& B,
+void assert_equal(const THRUST_NS_QUALIFIER::universal_vector<T,Alloc1>& A,
+                  const THRUST_NS_QUALIFIER::host_vector<T,Alloc2>& B,
                   const std::string& filename = "unknown", int lineno = -1)
 {
-    thrust::host_vector<T,Alloc1> B_host = B;
-    assert_equal(A, B_host, filename, lineno);
+    assert_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno);
 }
 
 template <typename T, typename Alloc1, typename Alloc2>
-void assert_equal(const thrust::device_vector<T,Alloc1>& A, const thrust::host_vector<T,Alloc2>& B,
+void assert_equal(const THRUST_NS_QUALIFIER::device_vector<T,Alloc1>& A,
+                  const THRUST_NS_QUALIFIER::universal_vector<T,Alloc2>& B,
                   const std::string& filename = "unknown", int lineno = -1)
 {
-    thrust::host_vector<T,Alloc2> A_host = A;
+    THRUST_NS_QUALIFIER::host_vector<T,Alloc1> A_host = A;
     assert_equal(A_host, B, filename, lineno);
 }
 
 template <typename T, typename Alloc1, typename Alloc2>
-void assert_equal(const thrust::device_vector<T,Alloc1>& A, const thrust::device_vector<T,Alloc2>& B,
+void assert_equal(const THRUST_NS_QUALIFIER::universal_vector<T,Alloc1>& A,
+                  const THRUST_NS_QUALIFIER::device_vector<T,Alloc2>& B,
                   const std::string& filename = "unknown", int lineno = -1)
 {
-    thrust::host_vector<T> A_host = A;
-    thrust::host_vector<T> B_host = B;
-    assert_equal(A_host, B_host, filename, lineno);
+    THRUST_NS_QUALIFIER::host_vector<T,Alloc1> B_host = B;
+    assert_equal(A, B_host, filename, lineno);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_equal(const std::vector<T,Alloc1>& A, const std::vector<T,Alloc2>& B,
+                  const std::string& filename = "unknown", int lineno = -1)
+{
+    assert_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_almost_equal(const THRUST_NS_QUALIFIER::host_vector<T,Alloc1>& A,
+                         const THRUST_NS_QUALIFIER::host_vector<T,Alloc2>& B,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
+{
+    assert_almost_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno, a_tol, r_tol);
 }
 
 template <typename T, typename Alloc1, typename Alloc2>
-void assert_almost_equal(const thrust::host_vector<T,Alloc1>& A, const thrust::device_vector<T,Alloc2>& B,
+void assert_almost_equal(const THRUST_NS_QUALIFIER::host_vector<T,Alloc1>& A,
+                         const THRUST_NS_QUALIFIER::device_vector<T,Alloc2>& B,
                          const std::string& filename = "unknown", int lineno = -1,
                          const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
 {
-    thrust::host_vector<T,Alloc1> B_host = B;
+    THRUST_NS_QUALIFIER::host_vector<T,Alloc1> B_host = B;
     assert_almost_equal(A, B_host, filename, lineno, a_tol, r_tol);
 }
 
 template <typename T, typename Alloc1, typename Alloc2>
-void assert_almost_equal(const thrust::device_vector<T,Alloc1>& A, const thrust::host_vector<T,Alloc2>& B,
+void assert_almost_equal(const THRUST_NS_QUALIFIER::device_vector<T,Alloc1>& A,
+                         const THRUST_NS_QUALIFIER::host_vector<T,Alloc2>& B,
                          const std::string& filename = "unknown", int lineno = -1,
                          const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
 {
-    thrust::host_vector<T,Alloc2> A_host = A;
+    THRUST_NS_QUALIFIER::host_vector<T,Alloc2> A_host = A;
     assert_almost_equal(A_host, B, filename, lineno, a_tol, r_tol);
 }
 
 template <typename T, typename Alloc1, typename Alloc2>
-void assert_almost_equal(const thrust::device_vector<T,Alloc1>& A, const thrust::device_vector<T,Alloc2>& B,
+void assert_almost_equal(const THRUST_NS_QUALIFIER::device_vector<T,Alloc1>& A,
+                         const THRUST_NS_QUALIFIER::device_vector<T,Alloc2>& B,
                          const std::string& filename = "unknown", int lineno = -1,
                          const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
 {
-    thrust::host_vector<T> A_host = A;
-    thrust::host_vector<T> B_host = B;
+    THRUST_NS_QUALIFIER::host_vector<T> A_host = A;
+    THRUST_NS_QUALIFIER::host_vector<T> B_host = B;
     assert_almost_equal(A_host, B_host, filename, lineno, a_tol, r_tol);
 }
 
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_almost_equal(const THRUST_NS_QUALIFIER::universal_vector<T,Alloc1>& A,
+                         const THRUST_NS_QUALIFIER::universal_vector<T,Alloc2>& B,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
+{
+    assert_almost_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno, a_tol, r_tol);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_almost_equal(const THRUST_NS_QUALIFIER::host_vector<T,Alloc1>& A,
+                         const THRUST_NS_QUALIFIER::universal_vector<T,Alloc2>& B,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
+{
+    assert_almost_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno, a_tol, r_tol);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_almost_equal(const THRUST_NS_QUALIFIER::universal_vector<T,Alloc1>& A,
+                         const THRUST_NS_QUALIFIER::host_vector<T,Alloc2>& B,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
+{
+    assert_almost_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno, a_tol, r_tol);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_almost_equal(const THRUST_NS_QUALIFIER::device_vector<T,Alloc1>& A,
+                         const THRUST_NS_QUALIFIER::universal_vector<T,Alloc2>& B,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
+{
+    THRUST_NS_QUALIFIER::host_vector<T,Alloc1> A_host = A;
+    assert_almost_equal(A_host, B, filename, lineno, a_tol, r_tol);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_almost_equal(const THRUST_NS_QUALIFIER::universal_vector<T,Alloc1>& A,
+                         const THRUST_NS_QUALIFIER::device_vector<T,Alloc2>& B,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
+{
+    THRUST_NS_QUALIFIER::host_vector<T,Alloc1> B_host = B;
+    assert_almost_equal(A, B_host, filename, lineno, a_tol, r_tol);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_almost_equal(const std::vector<T,Alloc1>& A, const std::vector<T,Alloc2>& B,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
+{
+    assert_almost_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno, a_tol, r_tol);
+}
+
 enum threw_status
 {
   did_not_throw
diff --git a/testing/unittest/cuda/testframework.cu b/testing/unittest/cuda/testframework.cu
index 8f2073157..ff30f368c 100644
--- a/testing/unittest/cuda/testframework.cu
+++ b/testing/unittest/cuda/testframework.cu
@@ -2,6 +2,7 @@
 #include <unittest/cuda/testframework.h>
 #include <thrust/system/cuda/memory.h>
 #include <cuda_runtime.h>
+#include <numeric>
 
 __global__ void dummy_kernel() {}
 
@@ -28,15 +29,15 @@ void list_devices(void)
   {
     std::cout << "There is no device supporting CUDA" << std::endl;
   }
-  
+
   int selected_device;
   cudaGetDevice(&selected_device);
-  
+
   for (int dev = 0; dev < deviceCount; ++dev)
   {
     cudaDeviceProp deviceProp;
     cudaGetDeviceProperties(&deviceProp, dev);
-    
+
     if(dev == 0)
     {
       if(deviceProp.major == 9999 && deviceProp.minor == 9999)
@@ -46,12 +47,12 @@ void list_devices(void)
       else
         std::cout << "There are " << deviceCount <<  " devices supporting CUDA" << std:: endl;
     }
-    
+
     std::cout << "\nDevice " << dev << ": \"" << deviceProp.name << "\"";
     if(dev == selected_device)
       std::cout << "  [SELECTED]";
     std::cout << std::endl;
-    
+
     std::cout << "  Major revision number:                         " << deviceProp.major << std::endl;
     std::cout << "  Minor revision number:                         " << deviceProp.minor << std::endl;
     std::cout << "  Total amount of global memory:                 " << deviceProp.totalGlobalMem << " bytes" << std::endl;
@@ -69,27 +70,25 @@ template<typename Iterator> Iterator my_next(Iterator iter)
 std::vector<int> CUDATestDriver::target_devices(const ArgumentMap &kwargs)
 {
   std::vector<int> result;
-  
+
   // by default, test all devices in the system (device id -1)
   int device_id = kwargs.count("device") ? atoi(kwargs.find("device")->second.c_str()) : -1;
-  
+
   if(device_id < 0)
   {
     // target all devices in the system
     int count = 0;
     cudaGetDeviceCount(&count);
-    
+
     result.resize(count);
-    // XXX iota is not available in c++03
-    for(int i = 0; i < count; ++i)
-      result[i] = i;
+    std::iota(result.begin(), result.end(), 0);
   }
   else
   {
     // target the specified device
     result = std::vector<int>(1,device_id);
   }
-  
+
   return result;
 }
 
@@ -106,12 +105,12 @@ bool CUDATestDriver::check_cuda_error(bool concise)
                 << std::string(cudaGetErrorString(error))
                 << "]" << std::endl;
     }
-  } 
+  }
 
   return cudaSuccess != error;
 }
 
-bool CUDATestDriver::post_test_sanity_check(const UnitTest &test, bool concise)
+bool CUDATestDriver::post_test_smoke_check(const UnitTest &test, bool concise)
 {
   cudaError_t const error = cudaDeviceSynchronize();
   if(cudaSuccess != error)
@@ -128,7 +127,7 @@ bool CUDATestDriver::post_test_sanity_check(const UnitTest &test, bool concise)
 
   return cudaSuccess == error;
 }
-  
+
 bool CUDATestDriver::run_tests(const ArgumentSet &args, const ArgumentMap &kwargs)
 {
   bool verbose = kwargs.count("verbose");
@@ -138,22 +137,21 @@ bool CUDATestDriver::run_tests(const ArgumentSet &args, const ArgumentMap &kwarg
   {
     std::cout << "--verbose and --concise cannot be used together" << std::endl;
     exit(EXIT_FAILURE);
-    return false;
   }
 
   // check error status before doing anything
   if(check_cuda_error(concise)) return false;
-  
+
   bool result = true;
 
   if(kwargs.count("verbose"))
   {
     list_devices();
   }
-  
+
   // figure out which devices to target
   std::vector<int> devices = target_devices(kwargs);
-  
+
   // target each device
   for(std::vector<int>::iterator device = devices.begin();
       device != devices.end();
@@ -171,7 +169,7 @@ bool CUDATestDriver::run_tests(const ArgumentSet &args, const ArgumentMap &kwarg
       // note which device we're skipping
       cudaDeviceProp deviceProp;
       cudaGetDeviceProperties(&deviceProp, *device);
-      
+
       std::cout << "Skipping Device " << *device << ": \"" << deviceProp.name << "\"" << std::endl;
 
       continue;
@@ -182,23 +180,23 @@ bool CUDATestDriver::run_tests(const ArgumentSet &args, const ArgumentMap &kwarg
       // note which device we're testing
       cudaDeviceProp deviceProp;
       cudaGetDeviceProperties(&deviceProp, *device);
-      
+
       std::cout << "Testing Device " << *device << ": \"" << deviceProp.name << "\"" << std::endl;
     }
 
     // check error status before running any tests
     if(check_cuda_error(concise)) return false;
-    
+
     // run tests
     result &= UnitTestDriver::run_tests(args, kwargs);
-    
+
     if(!concise && my_next(device) != devices.end())
     {
       // provide some separation between the output of separate tests
       std::cout << std::endl;
     }
   }
-  
+
   return result;
 }
 
diff --git a/testing/unittest/cuda/testframework.h b/testing/unittest/cuda/testframework.h
index 953f88c1c..34a3dce5a 100644
--- a/testing/unittest/cuda/testframework.h
+++ b/testing/unittest/cuda/testframework.h
@@ -16,7 +16,7 @@ class CUDATestDriver
 
     bool check_cuda_error(bool concise);
 
-    virtual bool post_test_sanity_check(const UnitTest &test, bool concise);
+    virtual bool post_test_smoke_check(const UnitTest &test, bool concise);
 
     virtual bool run_tests(const ArgumentSet &args, const ArgumentMap &kwargs);
 };
diff --git a/testing/unittest/meta.h b/testing/unittest/meta.h
index 39c62edb6..ed492634b 100644
--- a/testing/unittest/meta.h
+++ b/testing/unittest/meta.h
@@ -13,49 +13,10 @@ namespace unittest
 struct null_type {}; 
 
 // this type encapsulates a list of
-// up to 10 types
-template<typename T0 = null_type,
-         typename T1 = null_type,
-         typename T2 = null_type,
-         typename T3 = null_type,
-         typename T4 = null_type,
-         typename T5 = null_type,
-         typename T6 = null_type,
-         typename T7 = null_type,
-         typename T8 = null_type,
-         typename T9 = null_type,
-         typename T10 = null_type,
-         typename T11 = null_type,
-         typename T12 = null_type,
-         typename T13 = null_type,
-         typename T14 = null_type,
-         typename T15 = null_type,
-         typename T16 = null_type,
-         typename T17 = null_type,
-         typename T18 = null_type,
-         typename T19 = null_type>
+// types
+template<typename... Ts>
   struct type_list
 {
-  typedef T0 type_0;
-  typedef T1 type_1;
-  typedef T2 type_2;
-  typedef T3 type_3;
-  typedef T4 type_4;
-  typedef T5 type_5;
-  typedef T6 type_6;
-  typedef T7 type_7;
-  typedef T8 type_8;
-  typedef T9 type_9;
-  typedef T10 type_10;
-  typedef T11 type_11;
-  typedef T12 type_12;
-  typedef T13 type_13;
-  typedef T14 type_14;
-  typedef T15 type_15;
-  typedef T16 type_16;
-  typedef T17 type_17;
-  typedef T18 type_18;
-  typedef T19 type_19;
 };
 
 // this type provides a way of indexing
@@ -66,26 +27,17 @@ template<typename List, unsigned int i>
   typedef null_type type;
 };
 
-template<typename List>  struct get_type<List,0> { typedef typename List::type_0 type; };
-template<typename List>  struct get_type<List,1> { typedef typename List::type_1 type; };
-template<typename List>  struct get_type<List,2> { typedef typename List::type_2 type; };
-template<typename List>  struct get_type<List,3> { typedef typename List::type_3 type; };
-template<typename List>  struct get_type<List,4> { typedef typename List::type_4 type; };
-template<typename List>  struct get_type<List,5> { typedef typename List::type_5 type; };
-template<typename List>  struct get_type<List,6> { typedef typename List::type_6 type; };
-template<typename List>  struct get_type<List,7> { typedef typename List::type_7 type; };
-template<typename List>  struct get_type<List,8> { typedef typename List::type_8 type; };
-template<typename List>  struct get_type<List,9> { typedef typename List::type_9 type; };
-template<typename List>  struct get_type<List,10> { typedef typename List::type_10 type; };
-template<typename List>  struct get_type<List,11> { typedef typename List::type_11 type; };
-template<typename List>  struct get_type<List,12> { typedef typename List::type_12 type; };
-template<typename List>  struct get_type<List,13> { typedef typename List::type_13 type; };
-template<typename List>  struct get_type<List,14> { typedef typename List::type_14 type; };
-template<typename List>  struct get_type<List,15> { typedef typename List::type_15 type; };
-template<typename List>  struct get_type<List,16> { typedef typename List::type_16 type; };
-template<typename List>  struct get_type<List,17> { typedef typename List::type_17 type; };
-template<typename List>  struct get_type<List,18> { typedef typename List::type_18 type; };
-template<typename List>  struct get_type<List,19> { typedef typename List::type_19 type; };
+template<typename T, typename... Ts>
+  struct get_type<type_list<T, Ts...>, 0>
+{
+  typedef T type;
+};
+
+template<typename T, typename... Ts, unsigned int i>
+  struct get_type<type_list<T, Ts...>, i>
+{
+  typedef typename get_type<type_list<Ts...>, i - 1>::type type;
+};
 
 // this type and its specialization provides a way to
 // iterate over a type_list, and
@@ -196,64 +148,26 @@ template<template <typename,typename> class Template>
 // the Type_list's types
 template<typename TypeList,
          template <typename> class Template>
-  struct transform1
-{
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,0>::type>::type type_0;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,1>::type>::type type_1;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,2>::type>::type type_2;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,3>::type>::type type_3;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,4>::type>::type type_4;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,5>::type>::type type_5;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,6>::type>::type type_6;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,7>::type>::type type_7;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,8>::type>::type type_8;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,9>::type>::type type_9;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,10>::type>::type type_10;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,11>::type>::type type_11;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,12>::type>::type type_12;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,13>::type>::type type_13;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,14>::type>::type type_14;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,15>::type>::type type_15;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,16>::type>::type type_16;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,17>::type>::type type_17;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,18>::type>::type type_18;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,19>::type>::type type_19;
+  struct transform1;
 
-  typedef type_list<type_0, type_1, type_2, type_3, type_4, type_5, type_6, type_7, type_8, type_9,
-                    type_10, type_11, type_12, type_13, type_14, type_15, type_16, type_17, type_18, type_19> type;
+template<typename... Ts,
+         template <typename> class Template>
+  struct transform1<type_list<Ts...>, Template>
+{
+  typedef type_list<typename ApplyTemplate1<Template, Ts>::type...> type;
 };
 
-// this type creates a new type_list by applying a Template to each of
-// two type_list's types
 template<typename TypeList1,
          typename TypeList2,
          template <typename,typename> class Template>
-  struct transform2
-{
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,0>::type, typename get_type<TypeList2,0>::type>::type type_0;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,1>::type, typename get_type<TypeList2,1>::type>::type type_1;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,2>::type, typename get_type<TypeList2,2>::type>::type type_2;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,3>::type, typename get_type<TypeList2,3>::type>::type type_3;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,4>::type, typename get_type<TypeList2,4>::type>::type type_4;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,5>::type, typename get_type<TypeList2,5>::type>::type type_5;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,6>::type, typename get_type<TypeList2,6>::type>::type type_6;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,7>::type, typename get_type<TypeList2,7>::type>::type type_7;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,8>::type, typename get_type<TypeList2,8>::type>::type type_8;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,9>::type, typename get_type<TypeList2,9>::type>::type type_9;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,10>::type, typename get_type<TypeList2,10>::type>::type type_10;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,11>::type, typename get_type<TypeList2,11>::type>::type type_11;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,12>::type, typename get_type<TypeList2,12>::type>::type type_12;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,13>::type, typename get_type<TypeList2,13>::type>::type type_13;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,14>::type, typename get_type<TypeList2,14>::type>::type type_14;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,15>::type, typename get_type<TypeList2,15>::type>::type type_15;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,16>::type, typename get_type<TypeList2,16>::type>::type type_16;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,17>::type, typename get_type<TypeList2,17>::type>::type type_17;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,18>::type, typename get_type<TypeList2,18>::type>::type type_18;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,19>::type, typename get_type<TypeList2,19>::type>::type type_19;
-  
+  struct transform2;
 
-  typedef type_list<type_0, type_1, type_2, type_3, type_4, type_5, type_6, type_7, type_8, type_9,
-                    type_10, type_11, type_12, type_13, type_14, type_15, type_16, type_17, type_18, type_19> type;
+template<typename... T1s,
+         typename... T2s,
+         template <typename,typename> class Template>
+  struct transform2<type_list<T1s...>, type_list<T2s...>, Template>
+{
+  typedef type_list<typename ApplyTemplate2<Template, T1s, T2s>::type...> type;
 };
 
 } // end unittest
diff --git a/testing/unittest/random.h b/testing/unittest/random.h
index 924c0f0e1..c94c3fecb 100644
--- a/testing/unittest/random.h
+++ b/testing/unittest/random.h
@@ -25,14 +25,14 @@ template<typename T, typename = void>
 
 template<typename T>
   struct generate_random_integer<T,
-    typename thrust::detail::disable_if<
-      thrust::detail::is_non_bool_arithmetic<T>::value
+    typename THRUST_NS_QUALIFIER::detail::disable_if<
+      THRUST_NS_QUALIFIER::detail::is_non_bool_arithmetic<T>::value
     >::type
   >
 {
   T operator()(unsigned int i) const
   {
-      thrust::default_random_engine rng(hash(i));
+      THRUST_NS_QUALIFIER::default_random_engine rng(hash(i));
 
       return static_cast<T>(rng());
   }
@@ -40,15 +40,15 @@ template<typename T>
 
 template<typename T>
   struct generate_random_integer<T,
-    typename thrust::detail::enable_if<
-      thrust::detail::is_non_bool_integral<T>::value
+    typename THRUST_NS_QUALIFIER::detail::enable_if<
+      THRUST_NS_QUALIFIER::detail::is_non_bool_integral<T>::value
     >::type
   >
 {
   T operator()(unsigned int i) const
   {
-      thrust::default_random_engine rng(hash(i));
-      thrust::uniform_int_distribution<T> dist;
+      THRUST_NS_QUALIFIER::default_random_engine rng(hash(i));
+      THRUST_NS_QUALIFIER::uniform_int_distribution<T> dist;
 
       return static_cast<T>(dist(rng));
   }
@@ -56,8 +56,8 @@ template<typename T>
 
 template<typename T>
   struct generate_random_integer<T,
-    typename thrust::detail::enable_if<
-      thrust::detail::is_floating_point<T>::value
+    typename THRUST_NS_QUALIFIER::detail::enable_if<
+      THRUST_NS_QUALIFIER::detail::is_floating_point<T>::value
     >::type
   >
 {
@@ -66,8 +66,8 @@ template<typename T>
       T const min = std::numeric_limits<T>::min();
       T const max = std::numeric_limits<T>::max();
 
-      thrust::default_random_engine rng(hash(i));
-      thrust::uniform_real_distribution<T> dist(min, max);
+      THRUST_NS_QUALIFIER::default_random_engine rng(hash(i));
+      THRUST_NS_QUALIFIER::uniform_real_distribution<T> dist(min, max);
 
       return static_cast<T>(dist(rng));
   }
@@ -78,8 +78,8 @@ template<>
 {
   bool operator()(unsigned int i) const
   {
-      thrust::default_random_engine rng(hash(i));
-      thrust::uniform_int_distribution<unsigned int> dist(0,1);
+      THRUST_NS_QUALIFIER::default_random_engine rng(hash(i));
+      THRUST_NS_QUALIFIER::uniform_int_distribution<unsigned int> dist(0,1);
 
       return dist(rng) == 1;
   }
@@ -91,8 +91,8 @@ template<typename T>
 {
   T operator()(unsigned int i) const
   {
-      thrust::default_random_engine rng(hash(i));
-      thrust::uniform_int_distribution<unsigned int> dist(0,20);
+      THRUST_NS_QUALIFIER::default_random_engine rng(hash(i));
+      THRUST_NS_QUALIFIER::uniform_int_distribution<unsigned int> dist(0,20);
 
       return static_cast<T>(dist(rng));
   } 
@@ -101,13 +101,13 @@ template<typename T>
 
 
 template<typename T>
-thrust::host_vector<T> random_integers(const size_t N)
+THRUST_NS_QUALIFIER::host_vector<T> random_integers(const size_t N)
 {
-    thrust::host_vector<T> vec(N);
-    thrust::transform(thrust::counting_iterator<unsigned int>(static_cast<unsigned int>(0)),
-                      thrust::counting_iterator<unsigned int>(static_cast<unsigned int>(N)),
-                      vec.begin(),
-                      generate_random_integer<T>());
+    THRUST_NS_QUALIFIER::host_vector<T> vec(N);
+    THRUST_NS_QUALIFIER::transform(THRUST_NS_QUALIFIER::counting_iterator<unsigned int>(static_cast<unsigned int>(0)),
+                                   THRUST_NS_QUALIFIER::counting_iterator<unsigned int>(static_cast<unsigned int>(N)),
+                                   vec.begin(),
+                                   generate_random_integer<T>());
 
     return vec;
 }
@@ -119,13 +119,13 @@ T random_integer()
 }
 
 template<typename T>
-thrust::host_vector<T> random_samples(const size_t N)
+THRUST_NS_QUALIFIER::host_vector<T> random_samples(const size_t N)
 {
-    thrust::host_vector<T> vec(N);
-    thrust::transform(thrust::counting_iterator<unsigned int>(static_cast<unsigned int>(0)),
-                      thrust::counting_iterator<unsigned int>(static_cast<unsigned int>(N)),
-                      vec.begin(),
-                      generate_random_sample<T>());
+    THRUST_NS_QUALIFIER::host_vector<T> vec(N);
+    THRUST_NS_QUALIFIER::transform(THRUST_NS_QUALIFIER::counting_iterator<unsigned int>(static_cast<unsigned int>(0)),
+                                   THRUST_NS_QUALIFIER::counting_iterator<unsigned int>(static_cast<unsigned int>(N)),
+                                   vec.begin(),
+                                   generate_random_sample<T>());
 
     return vec;
 }
diff --git a/testing/unittest/runtime_static_assert.h b/testing/unittest/runtime_static_assert.h
index 199a90ef3..d53bd3b20 100644
--- a/testing/unittest/runtime_static_assert.h
+++ b/testing/unittest/runtime_static_assert.h
@@ -18,8 +18,11 @@ namespace unittest
 #include <thrust/device_new.h>
 #include <thrust/device_delete.h>
 
+#include <nv/target>
+
 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
 
+
 #define ASSERT_STATIC_ASSERT(X) \
     { \
         bool triggered = false; \
@@ -72,6 +75,10 @@ namespace unittest
 
     namespace detail
     {
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC || \
+    THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG
+        __attribute__((used))
+#endif
         __device__ static static_assert_exception* device_exception = NULL;
     }
 
@@ -82,11 +89,9 @@ namespace unittest
         {
             static_assert_exception ex(filename, lineno);
 
-#ifdef __CUDA_ARCH__
-            *detail::device_exception = ex;
-#else
-            throw ex;
-#endif
+            NV_IF_TARGET(NV_IS_DEVICE,
+                         (*detail::device_exception = ex;),
+                         (throw ex;));
         }
     }
 }
diff --git a/testing/unittest/special_types.h b/testing/unittest/special_types.h
index b046a96ee..9e4b0b743 100644
--- a/testing/unittest/special_types.h
+++ b/testing/unittest/special_types.h
@@ -128,7 +128,11 @@ void swap(user_swappable &x, user_swappable &y)
   y.was_swapped = false;
 }
 
-class my_system : public thrust::device_execution_policy<my_system>
+// Inheriting from classes in anonymous namespaces is not allowed.
+// The anonymous namespace tests don't use these, so just disable them:
+#ifndef THRUST_USE_ANON_NAMESPACE
+
+class my_system : public THRUST_NS_QUALIFIER::device_execution_policy<my_system>
 {
   public:
     my_system(int)
@@ -163,21 +167,23 @@ class my_system : public thrust::device_execution_policy<my_system>
     my_system();
 };
 
-struct my_tag : thrust::device_execution_policy<my_tag> {};
+struct my_tag : THRUST_NS_QUALIFIER::device_execution_policy<my_tag> {};
+
+#endif // THRUST_USE_ANON_NAMESPACE
 
 namespace unittest
 {
 
 
-using thrust::detail::int8_t;
-using thrust::detail::int16_t;
-using thrust::detail::int32_t;
-using thrust::detail::int64_t;
+using THRUST_NS_QUALIFIER::detail::int8_t;
+using THRUST_NS_QUALIFIER::detail::int16_t;
+using THRUST_NS_QUALIFIER::detail::int32_t;
+using THRUST_NS_QUALIFIER::detail::int64_t;
 
-using thrust::detail::uint8_t;
-using thrust::detail::uint16_t;
-using thrust::detail::uint32_t;
-using thrust::detail::uint64_t;
+using THRUST_NS_QUALIFIER::detail::uint8_t;
+using THRUST_NS_QUALIFIER::detail::uint16_t;
+using THRUST_NS_QUALIFIER::detail::uint32_t;
+using THRUST_NS_QUALIFIER::detail::uint64_t;
 
   
 }
diff --git a/testing/unittest/system.h b/testing/unittest/system.h
index f3602e994..766e732d3 100644
--- a/testing/unittest/system.h
+++ b/testing/unittest/system.h
@@ -12,7 +12,7 @@
 namespace unittest
 {
 
-#ifdef __GNUC__
+#if __GNUC__ && !_NVHPC_CUDA
 inline std::string demangle(const char* name)
 {
   int status = 0;
diff --git a/testing/unittest/testframework.cu b/testing/unittest/testframework.cu
index 26db08a3e..67d970399 100644
--- a/testing/unittest/testframework.cu
+++ b/testing/unittest/testframework.cu
@@ -30,7 +30,7 @@ const size_t standard_test_sizes[] =
   (1 << 26) + 1, (1 << 27) - 1, (1 << 27)
 };
 
-        
+
 const size_t tiny_threshold    = 1 <<  5;  //   32
 const size_t small_threshold   = 1 <<  8;  //  256
 const size_t medium_threshold  = 1 << 12;  //   4K
@@ -110,9 +110,9 @@ void process_args(int argc, char ** argv,
   {
     std::string arg(argv[i]);
 
-    // look for --key or --key=value arguments 
+    // look for --key or --key=value arguments
     if(arg.substr(0,2) == "--")
-    {   
+    {
       std::string::size_type n = arg.find('=',2);
 
       if(n == std::string::npos)
@@ -135,7 +135,7 @@ void process_args(int argc, char ** argv,
 void usage(int /*argc*/, char** argv)
 {
   std::string indent = "  ";
-  
+
   std::cout << "Example Usage:\n";
   std::cout << indent << argv[0] << "\n";
   std::cout << indent << argv[0] << " TestName1 [TestName2 ...] \n";
@@ -164,14 +164,14 @@ struct TestResult
   TestStatus  status;
   std::string name;
   std::string message;
-  
+
   // XXX use a c++11 timer result when available
   std::clock_t elapsed;
-  
+
   TestResult(const TestStatus status, std::clock_t elapsed, const UnitTest& u, const std::string& message = "")
       : status(status), name(u.name), message(message), elapsed(elapsed)
   {}
-  
+
   bool operator<(const TestResult& tr) const
   {
     if(status < tr.status)
@@ -199,20 +199,20 @@ void record_result(const TestResult& test_result, std::vector< TestResult >& tes
 void report_results(std::vector< TestResult >& test_results, double elapsed_minutes)
 {
   std::cout << std::endl;
-  
+
   std::string hline = "================================================================";
-  
+
   std::sort(test_results.begin(), test_results.end());
-  
+
   size_t num_passes = 0;
   size_t num_failures = 0;
   size_t num_known_failures = 0;
   size_t num_errors = 0;
-  
+
   for(size_t i = 0; i < test_results.size(); i++)
   {
     const TestResult& tr = test_results[i];
-    
+
     if(tr.status == Pass)
     {
       num_passes++;
@@ -220,7 +220,7 @@ void report_results(std::vector< TestResult >& test_results, double elapsed_minu
     else
     {
       std::cout << hline << std::endl;
-    
+
       switch(tr.status)
       {
         case Failure:
@@ -232,13 +232,13 @@ void report_results(std::vector< TestResult >& test_results, double elapsed_minu
         default:
           break;
       }
-    
+
       std::cout << ": " << tr.name << std::endl << tr.message << std::endl;
     }
   }
-  
+
   std::cout << hline << std::endl;
-  
+
   std::cout << "Totals: ";
   std::cout << num_failures << " failures, ";
   std::cout << num_known_failures << " known failures, ";
@@ -257,7 +257,7 @@ void UnitTestDriver::list_tests(void)
 }
 
 
-bool UnitTestDriver::post_test_sanity_check(const UnitTest &/*test*/, bool /*concise*/)
+bool UnitTestDriver::post_test_smoke_check(const UnitTest &/*test*/, bool /*concise*/)
 {
   return true;
 }
@@ -266,45 +266,45 @@ bool UnitTestDriver::post_test_sanity_check(const UnitTest &/*test*/, bool /*con
 bool UnitTestDriver::run_tests(std::vector<UnitTest *>& tests_to_run, const ArgumentMap& kwargs)
 {
   std::time_t start_time = std::time(0);
-  
+
   THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_WARNING_BEGIN
   bool verbose = kwargs.count("verbose");
   bool concise = kwargs.count("concise");
   THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_WARNING_END
-  
+
   std::vector< TestResult > test_results;
-  
+
   if(verbose && concise)
   {
     std::cout << "--verbose and --concise cannot be used together" << std::endl;
     exit(EXIT_FAILURE);
   }
-  
+
   if(!concise)
   {
     std::cout << "Running " << tests_to_run.size() << " unit tests." << std::endl;
   }
-  
+
   for(size_t i = 0; i < tests_to_run.size(); i++)
   {
      UnitTest& test = *tests_to_run[i];
-  
+
      if(verbose)
      {
        std::cout << "Running " << test.name << "..." << std::flush;
      }
-  
+
      try
      {
        // time the test
        std::clock_t start = std::clock();
-  
+
        // run the test
        test.run();
-  
+
        // test passed
        record_result(TestResult(Pass, std::clock() - start, test), test_results);
-     } 
+     }
      catch(unittest::UnitTestFailure& f)
      {
        record_result(TestResult(Failure, (std::numeric_limits<std::clock_t>::max)(), test, f.message), test_results);
@@ -321,7 +321,7 @@ bool UnitTestDriver::run_tests(std::vector<UnitTest *>& tests_to_run, const Argu
      {
        record_result(TestResult(Error, (std::numeric_limits<std::clock_t>::max)(), test, e.message), test_results);
      }
-  
+
      // immediate report
      if(!concise)
      {
@@ -342,7 +342,7 @@ bool UnitTestDriver::run_tests(std::vector<UnitTest *>& tests_to_run, const Argu
            default:
              break;
          }
-  
+
          std::cout << " " << test.name << std::endl;
        }
        else
@@ -362,24 +362,24 @@ bool UnitTestDriver::run_tests(std::vector<UnitTest *>& tests_to_run, const Argu
          }
        }
      }
-  
-     if(!post_test_sanity_check(test, concise))
+
+     if(!post_test_smoke_check(test, concise))
      {
        return false;
      }
-  
+
      std::cout.flush();
   }
-  
+
   double elapsed_minutes = double(std::time(0) - start_time) / 60;
-  
+
   // summary report
   if(!concise)
   {
     report_results(test_results, elapsed_minutes);
   }
-  
-  
+
+
   // if any failures or errors return false
   for(size_t i = 0; i < test_results.size(); i++)
   {
@@ -388,7 +388,7 @@ bool UnitTestDriver::run_tests(std::vector<UnitTest *>& tests_to_run, const Argu
       return false;
     }
   }
-  
+
   // all tests pass or are known failures
   return true;
 }
@@ -400,35 +400,35 @@ bool UnitTestDriver::run_tests(const ArgumentSet& args, const ArgumentMap& kwarg
   {
     // run all tests
     std::vector<UnitTest *> tests_to_run;
-    
+
     for(TestMap::iterator iter = test_map.begin(); iter != test_map.end(); iter++)
     {
       tests_to_run.push_back(iter->second);
     }
-    
+
     return run_tests(tests_to_run, kwargs);
   }
   else
   {
     // all non-keyword arguments are assumed to be test names or partial test names
-  
+
     typedef TestMap::iterator               TestMapIterator;
-  
+
     // vector to accumulate tests
     std::vector<UnitTest *> tests_to_run;
-  
+
     for(ArgumentSet::const_iterator iter = args.begin(); iter != args.end(); iter++)
     {
       const std::string& arg = *iter;
-  
+
       size_t len = arg.size();
       size_t matches = 0;
-  
+
       if(arg[len-1] == '*')
       {
         // wildcard search
         std::string search = arg.substr(0,len-1);
-  
+
         TestMapIterator lb = test_map.lower_bound(search);
         while(lb != test_map.end())
         {
@@ -436,8 +436,8 @@ bool UnitTestDriver::run_tests(const ArgumentSet& args, const ArgumentMap& kwarg
           {
             break;
           }
-  
-          tests_to_run.push_back(lb->second); 
+
+          tests_to_run.push_back(lb->second);
           lb++;
           matches++;
         }
@@ -446,21 +446,21 @@ bool UnitTestDriver::run_tests(const ArgumentSet& args, const ArgumentMap& kwarg
       {
         // non-wildcard search
         TestMapIterator lb = test_map.find(arg);
-  
+
         if(lb != test_map.end())
         {
-          tests_to_run.push_back(lb->second); 
+          tests_to_run.push_back(lb->second);
           matches++;
         }
       }
-  
+
       if(matches == 0)
       {
         std::cout << "[ERROR] found no test names matching the pattern: " << arg << std::endl;
         return false;
       }
     }
-  
+
     return run_tests(tests_to_run, kwargs);
   }
 }
@@ -487,21 +487,21 @@ int main(int argc, char **argv)
 {
   ArgumentSet args;
   ArgumentMap kwargs;
-  
+
   process_args(argc, argv, args, kwargs);
-  
+
   if(kwargs.count("help"))
   {
     usage(argc, argv);
     return 0;
   }
-  
+
   if(kwargs.count("list"))
   {
     UnitTestDriver::s_driver().list_tests();
     return 0;
   }
-  
+
   if(kwargs.count("sizes"))
   {
     set_test_sizes(kwargs["sizes"]);
@@ -510,14 +510,14 @@ int main(int argc, char **argv)
   {
     set_test_sizes("default");
   }
-  
+
   bool passed = UnitTestDriver::s_driver().run_tests(args, kwargs);
-  
+
   if(kwargs.count("concise"))
   {
     std::cout << ((passed) ? "PASSED" : "FAILED") << std::endl;
   }
-  
+
   return (passed) ? EXIT_SUCCESS : EXIT_FAILURE;
 }
 
diff --git a/testing/unittest/testframework.h b/testing/unittest/testframework.h
index bfeb363dc..c6ced96e7 100644
--- a/testing/unittest/testframework.h
+++ b/testing/unittest/testframework.h
@@ -1,20 +1,22 @@
 #pragma once
 
+#include <cstdio>
+#include <iostream>
+#include <map>
+#include <set>
 #include <string>
+#include <type_traits>
 #include <vector>
-#include <set>
-#include <map>
-#include <iostream>
-
-#include <stdio.h>
 
 #include "meta.h"
 #include "util.h"
 
 #include <thrust/limits.h>
+#include <thrust/detail/config.h>
 #include <thrust/detail/integer_traits.h>
-#include <thrust/memory/detail/device_system_resource.h>
-#include <thrust/memory/detail/host_system_resource.h>
+#include <thrust/mr/host_memory_resource.h>
+#include <thrust/mr/device_memory_resource.h>
+#include <thrust/mr/universal_memory_resource.h>
 #include <thrust/mr/allocator.h>
 
 // define some common lists of types
@@ -78,10 +80,13 @@ class custom_numeric
         fill(0);
     }
 
+    // Allow construction from any integral numeric.
+    template <typename T,
+              typename = typename std::enable_if<std::is_integral<T>::value>::type>
     __host__ __device__
-    custom_numeric(int i)
+    custom_numeric(const T& i)
     {
-        fill(i);
+        fill(static_cast<int>(i));
     }
 
     __host__ __device__
@@ -224,8 +229,7 @@ class custom_numeric
     }
 };
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 template <>
 struct numeric_limits<custom_numeric> : numeric_limits<int> {};
@@ -239,7 +243,9 @@ class integer_traits<custom_numeric>
   : public integer_traits_base<int, INT_MIN, INT_MAX>
 {};
 
-}} // namespace thrust::detail
+} // namespace detail
+
+THRUST_NAMESPACE_END
 
 typedef unittest::type_list<char,
                             signed char,
@@ -285,10 +291,13 @@ inline std::string base_class_name(const std::string& name)
   // if the name begins with "class ", chop it off
   chop_prefix(result, "class ");
 
-  // chop everything including and after first "<"
-  return result.replace(result.find_first_of("<"),
-                        result.size(),
-                        "");
+  const std::size_t first_lt = result.find_first_of("<");
+
+  if (first_lt < result.size())
+      // chop everything including and after first "<"
+      return result.replace(first_lt, result.size(), "");
+  else
+      return result;
 }
 
 enum TestStatus { Pass = 0, Failure = 1, KnownFailure = 2, Error = 3, UnknownException = 4};
@@ -328,7 +337,7 @@ class UnitTestDriver
   // \param test The UnitTest of interest
   // \param concise Whether or not to suppress output
   // \return true if all is well; false if the tests must be immediately aborted
-  virtual bool post_test_sanity_check(const UnitTest &test, bool concise);
+  virtual bool post_test_smoke_check(const UnitTest &test, bool concise);
 
 public:
   inline virtual ~UnitTestDriver() {};
@@ -356,7 +365,7 @@ class NAME##UnitTest : public UnitTest {                         \
     public:                                                      \
     NAME##UnitTest() : UnitTest(#NAME) {}                        \
     void run(){                                                  \
-            TEST();                                              \
+        TEST();                                                  \
     }                                                            \
 };                                                               \
 NAME##UnitTest NAME##Instance
@@ -385,15 +394,16 @@ void VTEST##Device(void) {                                      \
     VTEST< thrust::device_vector<int,                           \
         thrust::mr::stateless_resource_allocator<int,           \
             thrust::device_memory_resource> > >();              \
-    VTEST< thrust::device_vector<int,                           \
-        thrust::mr::stateless_resource_allocator<int,           \
-            thrust::universal_memory_resource> > >();           \
+}                                                               \
+void VTEST##Universal(void) {                                   \
+    VTEST< thrust::universal_vector<int> >();                   \
     VTEST< thrust::device_vector<int,                           \
         thrust::mr::stateless_resource_allocator<int,           \
             thrust::universal_host_pinned_memory_resource> > >();\
 }                                                               \
 DECLARE_UNITTEST(VTEST##Host);                                  \
-DECLARE_UNITTEST(VTEST##Device);
+DECLARE_UNITTEST(VTEST##Device);                                \
+DECLARE_UNITTEST(VTEST##Universal);
 
 // Same as above, but only for integral types
 #define DECLARE_INTEGRAL_VECTOR_UNITTEST(VTEST)                 \
@@ -407,8 +417,15 @@ void VTEST##Device(void) {                                      \
     VTEST< thrust::device_vector<short> >();                    \
     VTEST< thrust::device_vector<int> >();                      \
 }                                                               \
+void VTEST##Universal(void) {                                   \
+    VTEST< thrust::universal_vector<int> >();                   \
+    VTEST< thrust::device_vector<int,                           \
+        thrust::mr::stateless_resource_allocator<int,           \
+            thrust::universal_host_pinned_memory_resource> > >();\
+}                                                               \
 DECLARE_UNITTEST(VTEST##Host);                                  \
-DECLARE_UNITTEST(VTEST##Device);
+DECLARE_UNITTEST(VTEST##Device);                                \
+DECLARE_UNITTEST(VTEST##Universal);
 
 // Macro to create instances of a test for several data types.
 #define DECLARE_GENERIC_UNITTEST(TEST)                           \
@@ -428,6 +445,22 @@ class TEST##UnitTest : public UnitTest {                         \
 };                                                               \
 TEST##UnitTest TEST##Instance
 
+// Macro to create instances of a test for several array sizes.
+#define DECLARE_SIZED_UNITTEST(TEST)                             \
+class TEST##UnitTest : public UnitTest {                         \
+    public:                                                      \
+    TEST##UnitTest() : UnitTest(#TEST) {}                        \
+    void run()                                                   \
+    {                                                            \
+        std::vector<size_t> sizes = get_test_sizes();            \
+        for(size_t i = 0; i != sizes.size(); ++i)                \
+        {                                                        \
+            TEST(sizes[i]);                                      \
+        }                                                        \
+    }                                                            \
+};                                                               \
+TEST##UnitTest TEST##Instance
+
 // Macro to create instances of a test for several data types and array sizes
 #define DECLARE_VARIABLE_UNITTEST(TEST)                          \
 class TEST##UnitTest : public UnitTest {                         \
@@ -524,7 +557,7 @@ template<template <typename> class TestName, typename TypeList>
     {
         std::vector<size_t> sizes = get_test_sizes();
         for(size_t i = 0; i != sizes.size(); ++i)
-        {                                                 
+        {
             // get the first type in the list
             typedef typename unittest::get_type<TypeList,0>::type first_type;
 
@@ -532,7 +565,7 @@ template<template <typename> class TestName, typename TypeList>
 
             // loop over the types
             loop(sizes[i]);
-        }                                                 
+        }
     }
 }; // end VariableUnitTest
 
@@ -544,7 +577,7 @@ template<template <typename> class TestName,
     : public UnitTest
 {
   VectorUnitTest()
-    : UnitTest((base_class_name(unittest::type_name<TestName< Vector<int, Alloc<int> > > >()) + "<" + 
+    : UnitTest((base_class_name(unittest::type_name<TestName< Vector<int, Alloc<int> > > >()) + "<" +
                 base_class_name(unittest::type_name<Vector<int, Alloc<int> > >()) + ">").c_str())
   { }
 
diff --git a/testing/unittest/util.h b/testing/unittest/util.h
index 02c1eb7ce..986f80c7b 100644
--- a/testing/unittest/util.h
+++ b/testing/unittest/util.h
@@ -21,26 +21,26 @@ template<typename T>
 // Use this with counting_iterator to avoid generating a range larger than we
 // can represent.
 template <typename T>
-typename thrust::detail::disable_if<
-  thrust::detail::is_floating_point<T>::value
+typename THRUST_NS_QUALIFIER::detail::disable_if<
+  THRUST_NS_QUALIFIER::detail::is_floating_point<T>::value
 , T
 >::type truncate_to_max_representable(std::size_t n)
 {
-  return thrust::min<std::size_t>(
-    n, static_cast<std::size_t>(thrust::numeric_limits<T>::max())
-  );
+  return static_cast<T>(THRUST_NS_QUALIFIER::min<std::size_t>(
+    n,
+    static_cast<std::size_t>(THRUST_NS_QUALIFIER::numeric_limits<T>::max())));
 }
 
 // TODO: This probably won't work for `half`.
 template <typename T>
-typename thrust::detail::enable_if<
-  thrust::detail::is_floating_point<T>::value
+typename THRUST_NS_QUALIFIER::detail::enable_if<
+  THRUST_NS_QUALIFIER::detail::is_floating_point<T>::value
 , T
 >::type truncate_to_max_representable(std::size_t n)
 {
-  return thrust::min<T>(
-    n, thrust::numeric_limits<T>::max()
-  );
+  return THRUST_NS_QUALIFIER::min<T>(
+    static_cast<T>(n),
+    THRUST_NS_QUALIFIER::numeric_limits<T>::max());
 }
 
 } // end unittest
diff --git a/testing/unittest/util_async.h b/testing/unittest/util_async.h
index 984cc61c6..9a3454efd 100644
--- a/testing/unittest/util_async.h
+++ b/testing/unittest/util_async.h
@@ -1,9 +1,9 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/cpp11_required.h>
+#include <thrust/detail/cpp14_required.h>
 
-#if THRUST_CPP_DIALECT >= 2011
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <unittest/unittest.h>
 
@@ -73,5 +73,4 @@ auto test_future_value_retrieval(
 
 } // namespace unittest
 
-#endif // THRUST_CPP_DIALECT >= 2011
-
+#endif // THRUST_CPP_DIALECT >= 2014
diff --git a/testing/unittest_static_assert.cmake b/testing/unittest_static_assert.cmake
new file mode 100644
index 000000000..a8a96f2bd
--- /dev/null
+++ b/testing/unittest_static_assert.cmake
@@ -0,0 +1,10 @@
+# Disable unreachable code warnings.
+# This test unconditionally throws in some places, the compiler will detect that
+# control flow will never reach some instructions. This is intentional.
+target_link_libraries(${test_target} PRIVATE thrust.silence_unreachable_code_warnings)
+
+# The machinery behind this test is not compatible with NVC++.
+# See https://github.com/NVIDIA/thrust/issues/1397
+if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  set_tests_properties(${test_target} PROPERTIES DISABLED True)
+endif()
diff --git a/testing/unittest_static_assert.cu b/testing/unittest_static_assert.cu
index dd5ed659b..7ed0d5658 100644
--- a/testing/unittest_static_assert.cu
+++ b/testing/unittest_static_assert.cu
@@ -12,7 +12,7 @@ template<typename T>
 struct static_assertion
 {
     __host__ __device__
-    int operator()() const
+    T operator()() const
     {
         THRUST_STATIC_ASSERT(dependent_false<T>::value);
         return 0;
@@ -22,9 +22,9 @@ struct static_assertion
 template<typename V>
 void TestStaticAssertAssert()
 {
-#if THRUST_DEVICE_SYSTEM != THRUST_DEVICE_SYSTEM_OMP
+    using value_type = typename V::value_type;
     V test(10);
-    ASSERT_STATIC_ASSERT(thrust::generate(test.begin(), test.end(), static_assertion<int>()));
-#endif
+    ASSERT_STATIC_ASSERT(thrust::generate(test.begin(), test.end(),
+                                          static_assertion<value_type>()));
 }
 DECLARE_VECTOR_UNITTEST(TestStaticAssertAssert);
diff --git a/testing/universal_memory.cu b/testing/universal_memory.cu
new file mode 100644
index 000000000..18a30fbfe
--- /dev/null
+++ b/testing/universal_memory.cu
@@ -0,0 +1,166 @@
+#include <unittest/unittest.h>
+
+#include <thrust/sequence.h>
+#include <thrust/allocate_unique.h>
+#include <thrust/universal_vector.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+
+#include <numeric>
+#include <vector>
+
+namespace
+{
+
+// The managed_memory_pointer class should be identified as a
+// contiguous_iterator
+THRUST_STATIC_ASSERT(
+  thrust::is_contiguous_iterator<thrust::universal_allocator<int>::pointer>::value);
+
+template <typename T>
+struct some_object {
+  some_object(T data)
+      : m_data(data)
+  {}
+
+  void setter(T data) { m_data = data; }
+  T getter() const { return m_data; }
+
+private:
+  T m_data;
+};
+
+} // namespace
+
+template <typename T>
+void TestUniversalAllocateUnique()
+{
+  // Simple test to ensure that pointers created with universal_memory_resource
+  // can be dereferenced and used with STL code. This is necessary as some
+  // STL implementations break when using fancy references that overload
+  // operator&, so universal_memory_resource uses a special pointer type that
+  // returns regular C++ references that can be safely used host-side.
+
+  // These operations fail to compile with fancy references:
+  auto raw = thrust::allocate_unique<T>(thrust::universal_allocator<T>{}, 42);
+  auto obj = thrust::allocate_unique<some_object<T>>(
+    thrust::universal_allocator<some_object<T> >{}, 42
+  );
+
+  static_assert(
+    std::is_same<decltype(raw.get()),
+                 thrust::universal_ptr<T> >::value,
+    "Unexpected pointer type returned from std::unique_ptr::get.");
+  static_assert(
+    std::is_same<decltype(obj.get()),
+                 thrust::universal_ptr<some_object<T> > >::value,
+    "Unexpected pointer type returned from std::unique_ptr::get.");
+
+  ASSERT_EQUAL(*raw, T(42));
+  ASSERT_EQUAL(*raw.get(), T(42));
+  ASSERT_EQUAL(obj->getter(), T(42));
+  ASSERT_EQUAL((*obj).getter(), T(42));
+  ASSERT_EQUAL(obj.get()->getter(), T(42));
+  ASSERT_EQUAL((*obj.get()).getter(), T(42));
+}
+DECLARE_GENERIC_UNITTEST(TestUniversalAllocateUnique);
+
+template <typename T>
+void TestUniversalIterationRaw()
+{
+  auto array = thrust::allocate_unique_n<T>(
+    thrust::universal_allocator<T>{}, 6, 42);
+
+  static_assert(
+    std::is_same<decltype(array.get()), thrust::universal_ptr<T> >::value,
+    "Unexpected pointer type returned from std::unique_ptr::get.");
+
+  for (auto iter = array.get(), end = array.get() + 6; iter < end; ++iter)
+  {
+    ASSERT_EQUAL(*iter, T(42));
+    ASSERT_EQUAL(*iter.get(), T(42));
+  }
+}
+DECLARE_GENERIC_UNITTEST(TestUniversalIterationRaw);
+
+template <typename T>
+void TestUniversalIterationObj()
+{
+  auto array = thrust::allocate_unique_n<some_object<T>>(
+    thrust::universal_allocator<some_object<T>>{}, 6, 42);
+
+  static_assert(
+    std::is_same<decltype(array.get()),
+                 thrust::universal_ptr<some_object<T>>>::value,
+    "Unexpected pointer type returned from std::unique_ptr::get.");
+
+  for (auto iter = array.get(), end = array.get() + 6; iter < end; ++iter)
+  {
+    ASSERT_EQUAL(iter->getter(), T(42));
+    ASSERT_EQUAL((*iter).getter(), T(42));
+    ASSERT_EQUAL(iter.get()->getter(), T(42));
+    ASSERT_EQUAL((*iter.get()).getter(), T(42));
+  }
+}
+DECLARE_GENERIC_UNITTEST(TestUniversalIterationObj);
+
+template <typename T>
+void TestUniversalRawPointerCast()
+{
+  auto obj = thrust::allocate_unique<T>(thrust::universal_allocator<T>{}, 42);
+
+  static_assert(
+    std::is_same<decltype(obj.get()), thrust::universal_ptr<T>>::value,
+    "Unexpected pointer type returned from std::unique_ptr::get.");
+
+  static_assert(
+    std::is_same<decltype(thrust::raw_pointer_cast(obj.get())), T*>::value,
+    "Unexpected pointer type returned from thrust::raw_pointer_cast.");
+
+  *thrust::raw_pointer_cast(obj.get()) = T(17);
+
+  ASSERT_EQUAL(*obj, T(17));
+}
+DECLARE_GENERIC_UNITTEST(TestUniversalRawPointerCast);
+
+template <typename T>
+void TestUniversalThrustVector(std::size_t const n)
+{
+  thrust::host_vector<T>      host(n);
+  thrust::universal_vector<T> universal(n);
+
+  static_assert(
+    std::is_same<typename std::decay<decltype(universal)>::type::pointer,
+                 thrust::universal_ptr<T>>::value,
+    "Unexpected thrust::universal_vector pointer type.");
+
+  thrust::sequence(host.begin(), host.end(), 0);
+  thrust::sequence(universal.begin(), universal.end(), 0);
+
+  ASSERT_EQUAL(host.size(), n);
+  ASSERT_EQUAL(universal.size(), n);
+  ASSERT_EQUAL(host, universal);
+}
+DECLARE_VARIABLE_UNITTEST(TestUniversalThrustVector);
+
+// Verify that a std::vector using the universal allocator will work with
+// Standard Library algorithms.
+template <typename T>
+void TestUniversalStdVector(std::size_t const n)
+{
+  std::vector<T>                                 host(n);
+  std::vector<T, thrust::universal_allocator<T>> universal(n);
+
+  static_assert(
+    std::is_same<typename std::decay<decltype(universal)>::type::pointer,
+                 thrust::universal_ptr<T>>::value,
+    "Unexpected std::vector pointer type.");
+
+  std::iota(host.begin(), host.end(), 0);
+  std::iota(universal.begin(), universal.end(), 0);
+
+  ASSERT_EQUAL(host.size(), n);
+  ASSERT_EQUAL(universal.size(), n);
+  ASSERT_EQUAL(host, universal);
+}
+DECLARE_VARIABLE_UNITTEST(TestUniversalStdVector);
+
diff --git a/testing/vector.cu b/testing/vector.cu
index f88ef0a4f..b09a4b55c 100644
--- a/testing/vector.cu
+++ b/testing/vector.cu
@@ -1,6 +1,12 @@
 #include <unittest/unittest.h>
+
+#include <thrust/detail/config.h>
 #include <thrust/sequence.h>
 #include <thrust/device_malloc_allocator.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+#include <initializer_list>
+#endif
 #include <vector>
 #include <list>
 #include <limits>
@@ -34,6 +40,30 @@ void TestVectorBool(void)
 }
 DECLARE_UNITTEST(TestVectorBool);
 
+template <class Vector>
+void TestVectorInitializerList(void)
+{
+    Vector v{1, 2, 3};
+    ASSERT_EQUAL(v.size(), 3lu);
+    ASSERT_EQUAL(v[0], 1);
+    ASSERT_EQUAL(v[1], 2);
+    ASSERT_EQUAL(v[2], 3);
+
+    v = {1, 2, 3, 4};
+    ASSERT_EQUAL(v.size(), 4lu);
+    ASSERT_EQUAL(v[0], 1);
+    ASSERT_EQUAL(v[1], 2);
+    ASSERT_EQUAL(v[2], 3);
+    ASSERT_EQUAL(v[3], 4);
+    
+    const auto alloc = v.get_allocator();
+    Vector v2{{1, 2, 3}, alloc};
+    ASSERT_EQUAL(v2.size(), 3lu);
+    ASSERT_EQUAL(v2[0], 1);
+    ASSERT_EQUAL(v2[1], 2);
+    ASSERT_EQUAL(v2[2], 3);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorInitializerList);
 
 template <class Vector>
 void TestVectorFrontBack(void)
@@ -52,24 +82,27 @@ DECLARE_VECTOR_UNITTEST(TestVectorFrontBack);
 template <class Vector>
 void TestVectorData(void)
 {
+    typedef typename Vector::pointer PointerT;
+    typedef typename Vector::const_pointer PointerConstT;
+
     Vector v(3);
     v[0] = 0; v[1] = 1; v[2] = 2;
 
     ASSERT_EQUAL(0,          *v.data());
     ASSERT_EQUAL(1,          *(v.data() + 1));
     ASSERT_EQUAL(2,          *(v.data() + 2));
-    ASSERT_EQUAL(&v.front(),  v.data());
-    ASSERT_EQUAL(&*v.begin(), v.data());
-    ASSERT_EQUAL(&v[0],       v.data());
+    ASSERT_EQUAL(PointerT(&v.front()),  v.data());
+    ASSERT_EQUAL(PointerT(&*v.begin()), v.data());
+    ASSERT_EQUAL(PointerT(&v[0]),       v.data());
 
     const Vector &c_v = v;
 
     ASSERT_EQUAL(0,            *c_v.data());
     ASSERT_EQUAL(1,            *(c_v.data() + 1));
     ASSERT_EQUAL(2,            *(c_v.data() + 2));
-    ASSERT_EQUAL(&c_v.front(),  c_v.data());
-    ASSERT_EQUAL(&*c_v.begin(), c_v.data());
-    ASSERT_EQUAL(&c_v[0],       c_v.data());
+    ASSERT_EQUAL(PointerConstT(&c_v.front()),  c_v.data());
+    ASSERT_EQUAL(PointerConstT(&*c_v.begin()), c_v.data());
+    ASSERT_EQUAL(PointerConstT(&c_v[0]),       c_v.data());
 }
 DECLARE_VECTOR_UNITTEST(TestVectorData);
 
@@ -119,7 +152,7 @@ void TestVectorFromSTLVector(void)
     ASSERT_EQUAL(v[2], 2);
 
     v = stl_vector;
-    
+
     ASSERT_EQUAL(v.size(), 3lu);
     ASSERT_EQUAL(v[0], 0);
     ASSERT_EQUAL(v[1], 1);
@@ -169,7 +202,7 @@ template <class Vector>
 void TestVectorFromBiDirectionalIterator(void)
 {
     typedef typename Vector::value_type T;
-    
+
     std::list<T> stl_list;
     stl_list.push_back(0);
     stl_list.push_back(1);
@@ -189,7 +222,7 @@ template <class Vector>
 void TestVectorAssignFromBiDirectionalIterator(void)
 {
     typedef typename Vector::value_type T;
-    
+
     std::list<T> stl_list;
     stl_list.push_back(0);
     stl_list.push_back(1);
@@ -246,7 +279,7 @@ void TestVectorToAndFromHostVector(void)
     v[1] = 11;
     v[2] = 12;
 
-    ASSERT_EQUAL(h[0], 0);  ASSERT_EQUAL(v[0], 10); 
+    ASSERT_EQUAL(h[0], 0);  ASSERT_EQUAL(v[0], 10);
     ASSERT_EQUAL(h[1], 1);  ASSERT_EQUAL(v[1], 11);
     ASSERT_EQUAL(h[2], 2);  ASSERT_EQUAL(v[2], 12);
 
@@ -303,7 +336,7 @@ void TestVectorToAndFromDeviceVector(void)
     v[1] = 11;
     v[2] = 12;
 
-    ASSERT_EQUAL(h[0], 0);  ASSERT_EQUAL(v[0], 10); 
+    ASSERT_EQUAL(h[0], 0);  ASSERT_EQUAL(v[0], 10);
     ASSERT_EQUAL(h[1], 1);  ASSERT_EQUAL(v[1], 11);
     ASSERT_EQUAL(h[2], 2);  ASSERT_EQUAL(v[2], 12);
 
@@ -348,7 +381,7 @@ void TestVectorSwap(void)
 
     v.swap(u);
 
-    ASSERT_EQUAL(v[0], 10); ASSERT_EQUAL(u[0], 0);  
+    ASSERT_EQUAL(v[0], 10); ASSERT_EQUAL(u[0], 0);
     ASSERT_EQUAL(v[1], 11); ASSERT_EQUAL(u[1], 1);
     ASSERT_EQUAL(v[2], 12); ASSERT_EQUAL(u[2], 2);
 }
@@ -363,33 +396,33 @@ void TestVectorErasePosition(void)
 
     v.erase(v.begin() + 2);
 
-    ASSERT_EQUAL(v.size(), 4lu); 
-    ASSERT_EQUAL(v[0], 0); 
-    ASSERT_EQUAL(v[1], 1); 
-    ASSERT_EQUAL(v[2], 3); 
-    ASSERT_EQUAL(v[3], 4); 
-    
+    ASSERT_EQUAL(v.size(), 4lu);
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], 1);
+    ASSERT_EQUAL(v[2], 3);
+    ASSERT_EQUAL(v[3], 4);
+
     v.erase(v.begin() + 0);
 
-    ASSERT_EQUAL(v.size(), 3lu); 
-    ASSERT_EQUAL(v[0], 1); 
-    ASSERT_EQUAL(v[1], 3); 
-    ASSERT_EQUAL(v[2], 4); 
-    
+    ASSERT_EQUAL(v.size(), 3lu);
+    ASSERT_EQUAL(v[0], 1);
+    ASSERT_EQUAL(v[1], 3);
+    ASSERT_EQUAL(v[2], 4);
+
     v.erase(v.begin() + 2);
 
-    ASSERT_EQUAL(v.size(), 2lu); 
-    ASSERT_EQUAL(v[0], 1); 
-    ASSERT_EQUAL(v[1], 3); 
-    
+    ASSERT_EQUAL(v.size(), 2lu);
+    ASSERT_EQUAL(v[0], 1);
+    ASSERT_EQUAL(v[1], 3);
+
     v.erase(v.begin() + 1);
 
-    ASSERT_EQUAL(v.size(), 1lu); 
-    ASSERT_EQUAL(v[0], 1); 
+    ASSERT_EQUAL(v.size(), 1lu);
+    ASSERT_EQUAL(v[0], 1);
 
     v.erase(v.begin() + 0);
 
-    ASSERT_EQUAL(v.size(), 0lu); 
+    ASSERT_EQUAL(v.size(), 0lu);
 }
 DECLARE_VECTOR_UNITTEST(TestVectorErasePosition);
 
@@ -402,26 +435,26 @@ void TestVectorEraseRange(void)
 
     v.erase(v.begin() + 1, v.begin() + 3);
 
-    ASSERT_EQUAL(v.size(), 4lu); 
-    ASSERT_EQUAL(v[0], 0); 
-    ASSERT_EQUAL(v[1], 3); 
-    ASSERT_EQUAL(v[2], 4); 
-    ASSERT_EQUAL(v[3], 5); 
-    
+    ASSERT_EQUAL(v.size(), 4lu);
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], 3);
+    ASSERT_EQUAL(v[2], 4);
+    ASSERT_EQUAL(v[3], 5);
+
     v.erase(v.begin() + 2, v.end());
 
-    ASSERT_EQUAL(v.size(), 2lu); 
-    ASSERT_EQUAL(v[0], 0); 
-    ASSERT_EQUAL(v[1], 3); 
-    
+    ASSERT_EQUAL(v.size(), 2lu);
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], 3);
+
     v.erase(v.begin() + 0, v.begin() + 1);
 
-    ASSERT_EQUAL(v.size(), 1lu); 
-    ASSERT_EQUAL(v[0], 3); 
-    
+    ASSERT_EQUAL(v.size(), 1lu);
+    ASSERT_EQUAL(v[0], 3);
+
     v.erase(v.begin(), v.end());
 
-    ASSERT_EQUAL(v.size(), 0lu); 
+    ASSERT_EQUAL(v.size(), 0lu);
 }
 DECLARE_VECTOR_UNITTEST(TestVectorEraseRange);
 
@@ -449,21 +482,21 @@ void TestVectorEquality(void)
     s_b[0] = 0;    s_b[1] = 1;    s_b[2] = 3;
     s_b[0] = 0;    s_b[1] = 1;
 
-    ASSERT_EQUAL((h_a == h_a), true); ASSERT_EQUAL((h_a == d_a), true); ASSERT_EQUAL((d_a == h_a), true);  ASSERT_EQUAL((d_a == d_a), true); 
+    ASSERT_EQUAL((h_a == h_a), true); ASSERT_EQUAL((h_a == d_a), true); ASSERT_EQUAL((d_a == h_a), true);  ASSERT_EQUAL((d_a == d_a), true);
     ASSERT_EQUAL((h_b == h_b), true); ASSERT_EQUAL((h_b == d_b), true); ASSERT_EQUAL((d_b == h_b), true);  ASSERT_EQUAL((d_b == d_b), true);
     ASSERT_EQUAL((h_c == h_c), true); ASSERT_EQUAL((h_c == d_c), true); ASSERT_EQUAL((d_c == h_c), true);  ASSERT_EQUAL((d_c == d_c), true);
 
     // test vector vs device_vector
-    ASSERT_EQUAL((s_a == d_a), true); ASSERT_EQUAL((d_a == s_a), true); 
+    ASSERT_EQUAL((s_a == d_a), true); ASSERT_EQUAL((d_a == s_a), true);
     ASSERT_EQUAL((s_b == d_b), true); ASSERT_EQUAL((d_b == s_b), true);
     ASSERT_EQUAL((s_c == d_c), true); ASSERT_EQUAL((d_c == s_c), true);
 
     // test vector vs host_vector
-    ASSERT_EQUAL((s_a == h_a), true); ASSERT_EQUAL((h_a == s_a), true); 
+    ASSERT_EQUAL((s_a == h_a), true); ASSERT_EQUAL((h_a == s_a), true);
     ASSERT_EQUAL((s_b == h_b), true); ASSERT_EQUAL((h_b == s_b), true);
     ASSERT_EQUAL((s_c == h_c), true); ASSERT_EQUAL((h_c == s_c), true);
 
-    ASSERT_EQUAL((h_a == h_b), false); ASSERT_EQUAL((h_a == d_b), false); ASSERT_EQUAL((d_a == h_b), false); ASSERT_EQUAL((d_a == d_b), false); 
+    ASSERT_EQUAL((h_a == h_b), false); ASSERT_EQUAL((h_a == d_b), false); ASSERT_EQUAL((d_a == h_b), false); ASSERT_EQUAL((d_a == d_b), false);
     ASSERT_EQUAL((h_b == h_a), false); ASSERT_EQUAL((h_b == d_a), false); ASSERT_EQUAL((d_b == h_a), false); ASSERT_EQUAL((d_b == d_a), false);
     ASSERT_EQUAL((h_a == h_c), false); ASSERT_EQUAL((h_a == d_c), false); ASSERT_EQUAL((d_a == h_c), false); ASSERT_EQUAL((d_a == d_c), false);
     ASSERT_EQUAL((h_c == h_a), false); ASSERT_EQUAL((h_c == d_a), false); ASSERT_EQUAL((d_c == h_a), false); ASSERT_EQUAL((d_c == d_a), false);
@@ -471,7 +504,7 @@ void TestVectorEquality(void)
     ASSERT_EQUAL((h_c == h_b), false); ASSERT_EQUAL((h_c == d_b), false); ASSERT_EQUAL((d_c == h_b), false); ASSERT_EQUAL((d_c == d_b), false);
 
     // test vector vs device_vector
-    ASSERT_EQUAL((s_a == d_b), false); ASSERT_EQUAL((d_a == s_b), false); 
+    ASSERT_EQUAL((s_a == d_b), false); ASSERT_EQUAL((d_a == s_b), false);
     ASSERT_EQUAL((s_b == d_a), false); ASSERT_EQUAL((d_b == s_a), false);
     ASSERT_EQUAL((s_a == d_c), false); ASSERT_EQUAL((d_a == s_c), false);
     ASSERT_EQUAL((s_c == d_a), false); ASSERT_EQUAL((d_c == s_a), false);
@@ -479,7 +512,7 @@ void TestVectorEquality(void)
     ASSERT_EQUAL((s_c == d_b), false); ASSERT_EQUAL((d_c == s_b), false);
 
     // test vector vs host_vector
-    ASSERT_EQUAL((s_a == h_b), false); ASSERT_EQUAL((h_a == s_b), false); 
+    ASSERT_EQUAL((s_a == h_b), false); ASSERT_EQUAL((h_a == s_b), false);
     ASSERT_EQUAL((s_b == h_a), false); ASSERT_EQUAL((h_b == s_a), false);
     ASSERT_EQUAL((s_a == h_c), false); ASSERT_EQUAL((h_a == s_c), false);
     ASSERT_EQUAL((s_c == h_a), false); ASSERT_EQUAL((h_c == s_a), false);
@@ -511,21 +544,21 @@ void TestVectorInequality(void)
     s_b[0] = 0;    s_b[1] = 1;    s_b[2] = 3;
     s_b[0] = 0;    s_b[1] = 1;
 
-    ASSERT_EQUAL((h_a != h_a), false); ASSERT_EQUAL((h_a != d_a), false); ASSERT_EQUAL((d_a != h_a), false);  ASSERT_EQUAL((d_a != d_a), false); 
+    ASSERT_EQUAL((h_a != h_a), false); ASSERT_EQUAL((h_a != d_a), false); ASSERT_EQUAL((d_a != h_a), false);  ASSERT_EQUAL((d_a != d_a), false);
     ASSERT_EQUAL((h_b != h_b), false); ASSERT_EQUAL((h_b != d_b), false); ASSERT_EQUAL((d_b != h_b), false);  ASSERT_EQUAL((d_b != d_b), false);
     ASSERT_EQUAL((h_c != h_c), false); ASSERT_EQUAL((h_c != d_c), false); ASSERT_EQUAL((d_c != h_c), false);  ASSERT_EQUAL((d_c != d_c), false);
 
     // test vector vs device_vector
-    ASSERT_EQUAL((s_a != d_a), false); ASSERT_EQUAL((d_a != s_a), false); 
+    ASSERT_EQUAL((s_a != d_a), false); ASSERT_EQUAL((d_a != s_a), false);
     ASSERT_EQUAL((s_b != d_b), false); ASSERT_EQUAL((d_b != s_b), false);
     ASSERT_EQUAL((s_c != d_c), false); ASSERT_EQUAL((d_c != s_c), false);
 
     // test vector vs host_vector
-    ASSERT_EQUAL((s_a != h_a), false); ASSERT_EQUAL((h_a != s_a), false); 
+    ASSERT_EQUAL((s_a != h_a), false); ASSERT_EQUAL((h_a != s_a), false);
     ASSERT_EQUAL((s_b != h_b), false); ASSERT_EQUAL((h_b != s_b), false);
     ASSERT_EQUAL((s_c != h_c), false); ASSERT_EQUAL((h_c != s_c), false);
 
-    ASSERT_EQUAL((h_a != h_b), true); ASSERT_EQUAL((h_a != d_b), true); ASSERT_EQUAL((d_a != h_b), true); ASSERT_EQUAL((d_a != d_b), true); 
+    ASSERT_EQUAL((h_a != h_b), true); ASSERT_EQUAL((h_a != d_b), true); ASSERT_EQUAL((d_a != h_b), true); ASSERT_EQUAL((d_a != d_b), true);
     ASSERT_EQUAL((h_b != h_a), true); ASSERT_EQUAL((h_b != d_a), true); ASSERT_EQUAL((d_b != h_a), true); ASSERT_EQUAL((d_b != d_a), true);
     ASSERT_EQUAL((h_a != h_c), true); ASSERT_EQUAL((h_a != d_c), true); ASSERT_EQUAL((d_a != h_c), true); ASSERT_EQUAL((d_a != d_c), true);
     ASSERT_EQUAL((h_c != h_a), true); ASSERT_EQUAL((h_c != d_a), true); ASSERT_EQUAL((d_c != h_a), true); ASSERT_EQUAL((d_c != d_a), true);
@@ -533,7 +566,7 @@ void TestVectorInequality(void)
     ASSERT_EQUAL((h_c != h_b), true); ASSERT_EQUAL((h_c != d_b), true); ASSERT_EQUAL((d_c != h_b), true); ASSERT_EQUAL((d_c != d_b), true);
 
     // test vector vs device_vector
-    ASSERT_EQUAL((s_a != d_b), true); ASSERT_EQUAL((d_a != s_b), true); 
+    ASSERT_EQUAL((s_a != d_b), true); ASSERT_EQUAL((d_a != s_b), true);
     ASSERT_EQUAL((s_b != d_a), true); ASSERT_EQUAL((d_b != s_a), true);
     ASSERT_EQUAL((s_a != d_c), true); ASSERT_EQUAL((d_a != s_c), true);
     ASSERT_EQUAL((s_c != d_a), true); ASSERT_EQUAL((d_c != s_a), true);
@@ -541,7 +574,7 @@ void TestVectorInequality(void)
     ASSERT_EQUAL((s_c != d_b), true); ASSERT_EQUAL((d_c != s_b), true);
 
     // test vector vs host_vector
-    ASSERT_EQUAL((s_a != h_b), true); ASSERT_EQUAL((h_a != s_b), true); 
+    ASSERT_EQUAL((s_a != h_b), true); ASSERT_EQUAL((h_a != s_b), true);
     ASSERT_EQUAL((s_b != h_a), true); ASSERT_EQUAL((h_b != s_a), true);
     ASSERT_EQUAL((s_a != h_c), true); ASSERT_EQUAL((h_a != s_c), true);
     ASSERT_EQUAL((s_c != h_a), true); ASSERT_EQUAL((h_c != s_a), true);
@@ -585,8 +618,8 @@ void TestVectorResizing(void)
 
     ASSERT_EQUAL(v.size(), 0lu);
 
-// TODO remove this WAR      
-#if defined(__CUDACC__) && CUDA_VERSION==3000
+// TODO remove this WAR
+#if defined(__CUDACC__) && CUDART_VERSION==3000
     // depending on sizeof(T), we will receive one
     // of two possible exceptions
     try
@@ -599,7 +632,7 @@ void TestVectorResizing(void)
       // reset the CUDA error
       cudaGetLastError();
     } // end catch
-#endif // defined(__CUDACC__) && CUDA_VERSION==3000
+#endif // defined(__CUDACC__) && CUDART_VERSION==3000
 
     ASSERT_EQUAL(v.size(), 0lu);
 }
@@ -622,15 +655,15 @@ void TestVectorReserving(void)
 
     ASSERT_EQUAL(v.capacity(), old_capacity);
 
-// TODO remove this WAR      
-#if defined(__CUDACC__) && CUDA_VERSION==3000
+// TODO remove this WAR
+#if defined(__CUDACC__) && CUDART_VERSION==3000
     try
     {
       v.reserve(std::numeric_limits<size_t>::max());
     }
     catch(std::length_error e) {}
     catch(std::bad_alloc e) {}
-#endif // defined(__CUDACC__) && CUDA_VERSION==3000
+#endif // defined(__CUDACC__) && CUDART_VERSION==3000
 
     ASSERT_EQUAL(v.capacity(), old_capacity);
 }
@@ -638,6 +671,19 @@ DECLARE_VECTOR_UNITTEST(TestVectorReserving)
 
 
 
+template <class Vector>
+void TestVectorUninitialisedCopy(void)
+{
+    thrust::device_vector<int> v;
+    std::vector<int> std_vector;
+
+    v = std_vector;
+
+    ASSERT_EQUAL(v.size(), static_cast<size_t>(0));
+}
+DECLARE_VECTOR_UNITTEST(TestVectorUninitialisedCopy);
+
+
 template <class Vector>
 void TestVectorShrinkToFit(void)
 {
@@ -680,7 +726,7 @@ struct LargeStruct
 
 void TestVectorContainingLargeType(void)
 {
-    // Thrust issue #5 
+    // Thrust issue #5
     // http://code.google.com/p/thrust/issues/detail?id=5
     const static int N = 100;
     typedef LargeStruct<N> T;
@@ -692,9 +738,9 @@ void TestVectorContainingLargeType(void)
 
     thrust::device_vector<T> dv2(20);
     thrust::host_vector<T>   hv2(20);
-    
+
     ASSERT_EQUAL_QUIET(dv2, hv2);
-    
+
     // initialize tofirst element to something nonzero
     T ls;
 
@@ -703,15 +749,15 @@ void TestVectorContainingLargeType(void)
 
     thrust::device_vector<T> dv3(20, ls);
     thrust::host_vector<T>   hv3(20, ls);
-    
+
     ASSERT_EQUAL_QUIET(dv3, hv3);
-    
+
     // change first element
     ls.data[0] = -13;
 
     dv3[2] = ls;
     hv3[2] = ls;
-    
+
     ASSERT_EQUAL_QUIET(dv3, hv3);
 }
 DECLARE_UNITTEST(TestVectorContainingLargeType);
@@ -739,7 +785,7 @@ void TestVectorReversed(void)
 }
 DECLARE_VECTOR_UNITTEST(TestVectorReversed);
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
   template <class Vector>
   void TestVectorMove(void)
   {
diff --git a/testing/vector_allocators.cu b/testing/vector_allocators.cu
index 00535d1b0..568ea7ff6 100644
--- a/testing/vector_allocators.cu
+++ b/testing/vector_allocators.cu
@@ -1,10 +1,14 @@
 #include <unittest/unittest.h>
+
+#include <thrust/detail/config.h>
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 
 template<typename BaseAlloc, bool PropagateOnSwap>
 class stateful_allocator : public BaseAlloc
 {
+  typedef thrust::detail::allocator_traits<BaseAlloc> base_traits;
+
 public:
     stateful_allocator(int i) : state(i)
     {
@@ -23,7 +27,7 @@ public:
         return *this;
     }
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
     stateful_allocator(stateful_allocator && other)
         : BaseAlloc(std::move(other)), state(other.state)
     {
@@ -41,20 +45,35 @@ public:
     static int last_allocated;
     static int last_deallocated;
 
-    typedef
-        typename thrust::detail::allocator_traits<BaseAlloc>::pointer
-        pointer;
+    typedef typename base_traits::pointer pointer;
+    typedef typename base_traits::const_pointer const_pointer;
+    typedef typename base_traits::reference reference;
+    typedef typename base_traits::const_reference const_reference;
 
     pointer allocate(std::size_t size)
     {
+        BaseAlloc alloc;
         last_allocated = state;
-        return BaseAlloc::allocate(size);
+        return base_traits::allocate(alloc, size);
     }
 
     void deallocate(pointer ptr, std::size_t size)
     {
+        BaseAlloc alloc;
         last_deallocated = state;
-        return BaseAlloc::deallocate(ptr, size);
+        return base_traits::deallocate(alloc, ptr, size);
+    }
+
+    static void construct(pointer ptr)
+    {
+      BaseAlloc alloc;
+      return base_traits::construct(alloc, ptr);
+    }
+
+    static void destroy(pointer ptr)
+    {
+      BaseAlloc alloc;
+      return base_traits::destroy(alloc, ptr);
     }
 
     bool operator==(const stateful_allocator &rhs) const
@@ -129,7 +148,7 @@ void TestVectorAllocatorConstructors()
     ASSERT_EQUAL(Alloc::last_allocated, 2);
     Alloc::last_allocated = 0;
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
     // FIXME: uncomment this after the vector_base(vector_base&&, const Alloc&)
     // is fixed and implemented
     // Vector v5(std::move(v3), alloc2);
@@ -188,7 +207,7 @@ void TestVectorAllocatorPropagateOnCopyAssignmentDevice()
 }
 DECLARE_UNITTEST(TestVectorAllocatorPropagateOnCopyAssignmentDevice);
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
 template<typename Vector>
 void TestVectorAllocatorPropagateOnMoveAssignment()
 {
diff --git a/testing/zip_function.cu b/testing/zip_function.cu
new file mode 100644
index 000000000..a1545a1a1
--- /dev/null
+++ b/testing/zip_function.cu
@@ -0,0 +1,70 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+
+#include <unittest/unittest.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/transform.h>
+#include <thrust/zip_function.h>
+
+#include <iostream>
+
+using namespace unittest;
+
+struct SumThree
+{
+  template <typename T1, typename T2, typename T3>
+  __host__ __device__
+  auto operator()(T1 x, T2 y, T3 z) const
+  THRUST_DECLTYPE_RETURNS(x + y + z)
+}; // end SumThree
+
+struct SumThreeTuple
+{
+  template <typename Tuple>
+  __host__ __device__
+  auto operator()(Tuple x) const
+  THRUST_DECLTYPE_RETURNS(thrust::get<0>(x) + thrust::get<1>(x) + thrust::get<2>(x))
+}; // end SumThreeTuple
+
+template <typename T>
+struct TestZipFunctionTransform
+{
+  void operator()(const size_t n)
+  {
+    using namespace thrust;
+
+    host_vector<T> h_data0 = unittest::random_samples<T>(n);
+    host_vector<T> h_data1 = unittest::random_samples<T>(n);
+    host_vector<T> h_data2 = unittest::random_samples<T>(n);
+
+    device_vector<T> d_data0 = h_data0;
+    device_vector<T> d_data1 = h_data1;
+    device_vector<T> d_data2 = h_data2;
+
+    host_vector<T>   h_result_tuple(n);
+    host_vector<T>   h_result_zip(n);
+    device_vector<T> d_result_zip(n);
+
+    // Tuple base case
+    transform(make_zip_iterator(make_tuple(h_data0.begin(), h_data1.begin(), h_data2.begin())),
+              make_zip_iterator(make_tuple(h_data0.end(),   h_data1.end(),   h_data2.end())),
+              h_result_tuple.begin(),
+              SumThreeTuple{});
+    // Zip Function
+    transform(make_zip_iterator(make_tuple(h_data0.begin(), h_data1.begin(), h_data2.begin())),
+              make_zip_iterator(make_tuple(h_data0.end(),   h_data1.end(),   h_data2.end())),
+              h_result_zip.begin(),
+              make_zip_function(SumThree{}));
+    transform(make_zip_iterator(make_tuple(d_data0.begin(), d_data1.begin(), d_data2.begin())),
+              make_zip_iterator(make_tuple(d_data0.end(),   d_data1.end(),   d_data2.end())),
+              d_result_zip.begin(),
+              make_zip_function(SumThree{}));
+
+    ASSERT_EQUAL(h_result_tuple, h_result_zip);
+    ASSERT_EQUAL(h_result_tuple, d_result_zip);
+  }
+};
+VariableUnitTest<TestZipFunctionTransform, ThirtyTwoBitTypes> TestZipFunctionTransformInstance;
+
+#endif // THRUST_CPP_DIALECT
diff --git a/testing/zip_iterator.cu b/testing/zip_iterator.cu
index 3ea34b25f..c48ca2170 100644
--- a/testing/zip_iterator.cu
+++ b/testing/zip_iterator.cu
@@ -276,13 +276,14 @@ template <typename Vector>
 void TestZipIteratorCopy(void)
 {
   using namespace thrust;
+  using T = typename Vector::value_type;
 
   Vector input0(4),  input1(4);
   Vector output0(4), output1(4);
 
   // initialize input
-  sequence(input0.begin(), input0.end(),  0);
-  sequence(input1.begin(), input1.end(), 13);
+  sequence(input0.begin(), input0.end(), T{0});
+  sequence(input1.begin(), input1.end(), T{13});
 
   copy( make_zip_iterator(make_tuple(input0.begin(),  input1.begin())),
         make_zip_iterator(make_tuple(input0.end(),    input1.end())),
diff --git a/testing/zip_iterator_scan.cu b/testing/zip_iterator_scan.cu
index 9fb767a68..96ace6d76 100644
--- a/testing/zip_iterator_scan.cu
+++ b/testing/zip_iterator_scan.cu
@@ -40,18 +40,6 @@ struct TestZipIteratorScan
     host_vector<Tuple>   h_result(n);
     device_vector<Tuple> d_result(n);
 
-    // The tests below get miscompiled on Tesla hw for 8b types
-
-#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-    if(const CUDATestDriver *driver = dynamic_cast<const CUDATestDriver*>(&UnitTestDriver::s_driver()))
-    {
-      if(sizeof(T) == sizeof(unittest::uint8_t) && driver->current_device_architecture() < 200)
-      {
-        KNOWN_FAILURE;
-      } // end if
-    } // end if
-#endif
-
     // inclusive_scan (tuple output)
     inclusive_scan( make_zip_iterator(make_tuple(h_data0.begin(), h_data1.begin())),
                     make_zip_iterator(make_tuple(h_data0.end(),   h_data1.end())),
diff --git a/thrust.vlcc b/thrust.vlcc
deleted file mode 100644
index c3c860f5d..000000000
--- a/thrust.vlcc
+++ /dev/null
@@ -1,19 +0,0 @@
-# thrust component
-{
-  # Descriptive name for the component
-  "name"      : "Thrust Library",
-  # Component owner (email address)
-  "owner"     : "blelbach@nvidia.com",
-  "module"    : "CUDA - Thrust",
-
-  # Files included in this component specified with one or more paths.
-  # Paths are relative to this file location. Path syntax uses wildcards and p4-like '...'.
-   "files"     : [ "..."           
-                 ],
-  # Output produced by this component and the installation location
-  # for each output. The install location is relative to
-  # VULCAN_INSTALL_DIR. Optional 'kind' property can be used to tag
-  # artifact kinds.
-   "artifacts" : [ { "thrust/*"            : "cuda/${INSTALL_TARGET_DIR}/include/thrust/." }
-                 ]
-}
diff --git a/thrust/addressof.h b/thrust/addressof.h
index 5d4dbf349..d21df0c76 100644
--- a/thrust/addressof.h
+++ b/thrust/addressof.h
@@ -8,10 +8,10 @@
 #include <thrust/detail/config.h>
 
 #if THRUST_CPP_DIALECT >= 2011
-#  include <memory>
+#  include <thrust/detail/memory_wrapper.h>
 #endif
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 ///////////////////////////////////////////////////////////////////////////////
 
@@ -28,5 +28,4 @@ T* addressof(T& arg)
 
 ///////////////////////////////////////////////////////////////////////////////
 
-THRUST_END_NS
-
+THRUST_NAMESPACE_END
diff --git a/thrust/adjacent_difference.h b/thrust/adjacent_difference.h
index 838beabe5..e8385c240 100644
--- a/thrust/adjacent_difference.h
+++ b/thrust/adjacent_difference.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup transformations Transformations
  *  \{
@@ -51,11 +49,11 @@ namespace thrust
  *  \return The iterator <tt>result + (last - first)</tt>
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \c x and \c y are objects of \p InputIterator's \c value_type, then \c x - \c is defined,
  *          and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types,
  *          and the return type of <tt>x - y</tt> is convertible to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \remark Note that \p result is permitted to be the same iterator as \p first. This is
  *          useful for computing differences "in place".
@@ -77,7 +75,7 @@ namespace thrust
  *  // d_result is now [1, 1, -1, 1, -1, 1, -1, 1]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/adjacent_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/adjacent_difference
  *  \see inclusive_scan
  */
 template<typename DerivedPolicy, typename InputIterator, typename OutputIterator>
@@ -105,10 +103,10 @@ OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<D
  *  \return The iterator <tt>result + (last - first)</tt>
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type,
  *          and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *  \tparam BinaryFunction's \c result_type is convertible to a type in \p OutputIterator's set of \c value_types.
  *
  *  \remark Note that \p result is permitted to be the same iterator as \p first. This is
@@ -132,7 +130,7 @@ OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<D
  *  // d_result is now [1, 3, 3, 3, 3, 3, 3, 3]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/adjacent_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/adjacent_difference
  *  \see inclusive_scan
  */
 template<typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename BinaryFunction>
@@ -156,11 +154,11 @@ OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<D
  *  \param result The beginning of the output range.
  *  \return The iterator <tt>result + (last - first)</tt>
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \c x and \c y are objects of \p InputIterator's \c value_type, then \c x - \c is defined,
  *          and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types,
  *          and the return type of <tt>x - y</tt> is convertible to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \remark Note that \p result is permitted to be the same iterator as \p first. This is
  *          useful for computing differences "in place".
@@ -181,7 +179,7 @@ OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<D
  *  // d_result is now [1, 1, -1, 1, -1, 1, -1, 1]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/adjacent_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/adjacent_difference
  *  \see inclusive_scan
  */
 template <typename InputIterator, typename OutputIterator>
@@ -203,10 +201,10 @@ OutputIterator adjacent_difference(InputIterator first, InputIterator last,
  *  \param binary_op The binary function used to compute differences.
  *  \return The iterator <tt>result + (last - first)</tt>
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type,
  *          and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *  \tparam BinaryFunction's \c result_type is convertible to a type in \p OutputIterator's set of \c value_types.
  *
  *  \remark Note that \p result is permitted to be the same iterator as \p first. This is
@@ -229,7 +227,7 @@ OutputIterator adjacent_difference(InputIterator first, InputIterator last,
  *  // d_result is now [1, 3, 3, 3, 3, 3, 3, 3]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/adjacent_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/adjacent_difference
  *  \see inclusive_scan
  */
 template <typename InputIterator, typename OutputIterator, typename BinaryFunction>
@@ -240,7 +238,7 @@ OutputIterator adjacent_difference(InputIterator first, InputIterator last,
 /*! \}
  */
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/adjacent_difference.inl>
 
diff --git a/thrust/advance.h b/thrust/advance.h
index d077e0434..a5162e203 100644
--- a/thrust/advance.h
+++ b/thrust/advance.h
@@ -23,8 +23,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \{
@@ -38,7 +37,7 @@ namespace thrust
  *  \param i The iterator to be advanced.
  *  \param n The distance by which to advance the iterator.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *  \tparam Distance is an integral type that is convertible to \p InputIterator's distance type.
  *
  *  \pre \p n shall be negative only for bidirectional and random access iterators.
@@ -58,7 +57,7 @@ namespace thrust
  *  // iter - vec.begin() == 7
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/advance.html
+ *  \see https://en.cppreference.com/w/cpp/iterator/advance
  */
 template <typename InputIterator, typename Distance>
 __host__ __device__
@@ -135,7 +134,7 @@ BidirectionalIterator prev(
 /*! \} // end iterators
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/advance.inl>
 
diff --git a/thrust/allocate_unique.h b/thrust/allocate_unique.h
index 5daec97e0..ff10cb51c 100644
--- a/thrust/allocate_unique.h
+++ b/thrust/allocate_unique.h
@@ -16,9 +16,9 @@
 #include <thrust/detail/allocator/allocator_traits.h>
 
 #include <utility>
-#include <memory>
+#include <thrust/detail/memory_wrapper.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 // wg21.link/p0316r0
 
@@ -437,7 +437,7 @@ uninitialized_allocate_unique_n(
 
 ///////////////////////////////////////////////////////////////////////////////
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #endif // THRUST_CPP_DIALECT >= 2011
 
diff --git a/thrust/async/copy.h b/thrust/async/copy.h
index b5923be2c..a8edc7411 100644
--- a/thrust/async/copy.h
+++ b/thrust/async/copy.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2018 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,17 +14,16 @@
  *  limitations under the License.
  */
 
-/*! \file async/copy.h
- *  \brief Functions for asynchronously copying a range.
+/*! \file
+ *  \brief Algorithms for asynchronously copying a range.
  */
 
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/cpp11_required.h>
-#include <thrust/detail/modern_gcc_required.h>
+#include <thrust/detail/cpp14_required.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <thrust/detail/static_assert.h>
 #include <thrust/detail/select_system.h>
@@ -33,11 +32,14 @@
 
 #include <thrust/event.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace async
 {
 
+/*! \cond
+ */
+
 namespace unimplemented
 {
 
@@ -58,7 +60,7 @@ async_copy(
   , "this algorithm is not implemented for the specified system"
   );
   return {};
-} 
+}
 
 } // namespace unimplemented
 
@@ -78,10 +80,10 @@ struct copy_fn final
     thrust::detail::execution_policy_base<FromPolicy> const& from_exec
   , thrust::detail::execution_policy_base<ToPolicy> const&   to_exec
   , ForwardIt&& first, Sentinel&& last
-  , OutputIt&& output 
+  , OutputIt&& output
   )
   // ADL dispatch.
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     async_copy(
       thrust::detail::derived_cast(thrust::detail::strip_const(from_exec))
     , thrust::detail::derived_cast(thrust::detail::strip_const(to_exec))
@@ -98,13 +100,16 @@ struct copy_fn final
   static auto call(
     thrust::detail::execution_policy_base<DerivedPolicy> const& exec
   , ForwardIt&& first, Sentinel&& last
-  , OutputIt&& output 
-  ) 
-  // ADL dispatch.
-  THRUST_DECLTYPE_RETURNS(
-    async_copy(
+  , OutputIt&& output
+  )
+  THRUST_RETURNS(
+    copy_fn::call(
       thrust::detail::derived_cast(thrust::detail::strip_const(exec))
-    , thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+      // Synthesize a suitable new execution policy, because we don't want to
+      // try and extract twice from the one we were passed.
+    , typename remove_cvref_t<
+        decltype(thrust::detail::derived_cast(thrust::detail::strip_const(exec)))
+      >::tag_type{}
     , THRUST_FWD(first), THRUST_FWD(last)
     , THRUST_FWD(output)
     )
@@ -112,8 +117,8 @@ struct copy_fn final
 
   template <typename ForwardIt, typename Sentinel, typename OutputIt>
   __host__
-  static auto call(ForwardIt&& first, Sentinel&& last, OutputIt&& output) 
-  THRUST_DECLTYPE_RETURNS(
+  static auto call(ForwardIt&& first, Sentinel&& last, OutputIt&& output)
+  THRUST_RETURNS(
     copy_fn::call(
       thrust::detail::select_system(
         typename thrust::iterator_system<remove_cvref_t<ForwardIt>>::type{}
@@ -129,7 +134,7 @@ struct copy_fn final
   template <typename... Args>
   THRUST_NODISCARD __host__
   auto operator()(Args&&... args) const
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     call(THRUST_FWD(args)...)
   )
 };
@@ -138,9 +143,12 @@ struct copy_fn final
 
 THRUST_INLINE_CONSTANT copy_detail::copy_fn copy{};
 
+/*! \endcond
+ */
+
 } // namespace async
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #endif
 
diff --git a/thrust/async/for_each.h b/thrust/async/for_each.h
index 3bd86a692..0d3b3a189 100644
--- a/thrust/async/for_each.h
+++ b/thrust/async/for_each.h
@@ -1,9 +1,9 @@
 /*
- *  Copyright 2008-2018 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
- *  You may obtain a for_each of the License at
+ *  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
@@ -14,17 +14,16 @@
  *  limitations under the License.
  */
 
-/*! \file async/for_each.h
- *  \brief Functions for asynchronously iterating over the elements of a range.
+/*! \file
+ *  \brief Algorithms for asynchronously iterating over the elements of a range.
  */
 
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/cpp11_required.h>
-#include <thrust/detail/modern_gcc_required.h>
+#include <thrust/detail/cpp14_required.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <thrust/detail/static_assert.h>
 #include <thrust/detail/select_system.h>
@@ -33,11 +32,14 @@
 
 #include <thrust/event.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace async
 {
 
+/*! \cond
+ */
+
 namespace unimplemented
 {
 
@@ -56,13 +58,13 @@ async_for_each(
   , "this algorithm is not implemented for the specified system"
   );
   return {};
-} 
+}
 
 } // namespace unimplemented
 
 namespace for_each_detail
 {
-    
+
 using thrust::async::unimplemented::async_for_each;
 
 struct for_each_fn final
@@ -75,10 +77,10 @@ struct for_each_fn final
   static auto call(
     thrust::detail::execution_policy_base<DerivedPolicy> const& exec
   , ForwardIt&& first, Sentinel&& last
-  , UnaryFunction&& f 
+  , UnaryFunction&& f
   )
   // ADL dispatch.
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     async_for_each(
       thrust::detail::derived_cast(thrust::detail::strip_const(exec))
     , THRUST_FWD(first), THRUST_FWD(last)
@@ -88,8 +90,8 @@ struct for_each_fn final
 
   template <typename ForwardIt, typename Sentinel, typename UnaryFunction>
   __host__
-  static auto call(ForwardIt&& first, Sentinel&& last, UnaryFunction&& f) 
-  THRUST_DECLTYPE_RETURNS(
+  static auto call(ForwardIt&& first, Sentinel&& last, UnaryFunction&& f)
+  THRUST_RETURNS(
     for_each_fn::call(
       thrust::detail::select_system(
         typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
@@ -102,7 +104,7 @@ struct for_each_fn final
   template <typename... Args>
   THRUST_NODISCARD __host__
   auto operator()(Args&&... args) const
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     call(THRUST_FWD(args)...)
   )
 };
@@ -111,9 +113,11 @@ struct for_each_fn final
 
 THRUST_INLINE_CONSTANT for_each_detail::for_each_fn for_each{};
 
+/*! \endcond
+ */
+
 } // namespace async
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #endif
-
diff --git a/thrust/async/reduce.h b/thrust/async/reduce.h
index ab63d6224..8f4fe3133 100644
--- a/thrust/async/reduce.h
+++ b/thrust/async/reduce.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2018 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,17 +14,16 @@
  *  limitations under the License.
  */
 
-/*! \file async/reduce.h
- *  \brief Functions for asynchronously reducing a range to a single value.
+/*! \file
+ *  \brief Algorithms for asynchronously reducing a range to a single value.
  */
 
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/cpp11_required.h>
-#include <thrust/detail/modern_gcc_required.h>
+#include <thrust/detail/cpp14_required.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <thrust/detail/static_assert.h>
 #include <thrust/detail/select_system.h>
@@ -35,11 +34,14 @@
 
 #include <thrust/future.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace async
 {
 
+/*! \cond
+ */
+
 namespace unimplemented
 {
 
@@ -47,7 +49,7 @@ template <
   typename DerivedPolicy
 , typename ForwardIt, typename Sentinel, typename T, typename BinaryOp
 >
-__host__ 
+__host__
 future<DerivedPolicy, T>
 async_reduce(
   thrust::execution_policy<DerivedPolicy>&, ForwardIt, Sentinel, T, BinaryOp
@@ -58,7 +60,7 @@ async_reduce(
   , "this algorithm is not implemented for the specified system"
   );
   return {};
-} 
+}
 
 } // namespace unimplemented
 
@@ -81,7 +83,7 @@ struct reduce_fn final
   , BinaryOp&& op
   )
   // ADL dispatch.
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     async_reduce(
       thrust::detail::derived_cast(thrust::detail::strip_const(exec))
     , THRUST_FWD(first), THRUST_FWD(last)
@@ -95,13 +97,14 @@ struct reduce_fn final
   , typename ForwardIt, typename Sentinel, typename T
   >
   __host__
-  static auto call(
+  static auto call4(
     thrust::detail::execution_policy_base<DerivedPolicy> const& exec
   , ForwardIt&& first, Sentinel&& last
   , T&& init
+  , thrust::true_type
   )
   // ADL dispatch.
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     async_reduce(
       thrust::detail::derived_cast(thrust::detail::strip_const(exec))
     , THRUST_FWD(first), THRUST_FWD(last)
@@ -116,12 +119,13 @@ struct reduce_fn final
   >
   __host__
   static auto
-  call(
+  call3(
     thrust::detail::execution_policy_base<DerivedPolicy> const& exec
   , ForwardIt&& first, Sentinel&& last
+  , thrust::true_type
   )
   // ADL dispatch.
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     async_reduce(
       thrust::detail::derived_cast(thrust::detail::strip_const(exec))
     , THRUST_FWD(first), THRUST_FWD(last)
@@ -136,10 +140,12 @@ struct reduce_fn final
 
   template <typename ForwardIt, typename Sentinel, typename T, typename BinaryOp>
   __host__
-  static auto call(ForwardIt&& first, Sentinel&& last, T&& init, BinaryOp&& op)
-  THRUST_DECLTYPE_RETURNS_WITH_SFINAE_CONDITION(
-    (negation<is_execution_policy<remove_cvref_t<ForwardIt>>>::value)
-  , reduce_fn::call(
+  static auto call4(ForwardIt&& first, Sentinel&& last,
+                    T&& init,
+                    BinaryOp&& op,
+                    thrust::false_type)
+  THRUST_RETURNS(
+    reduce_fn::call(
       thrust::detail::select_system(
         typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
       )
@@ -151,10 +157,11 @@ struct reduce_fn final
 
   template <typename ForwardIt, typename Sentinel, typename T>
   __host__
-  static auto call(ForwardIt&& first, Sentinel&& last, T&& init)
-  THRUST_DECLTYPE_RETURNS_WITH_SFINAE_CONDITION(
-    (negation<is_execution_policy<remove_cvref_t<ForwardIt>>>::value)
-  , reduce_fn::call(
+  static auto call3(ForwardIt&& first, Sentinel&& last,
+                    T&& init,
+                    thrust::false_type)
+  THRUST_RETURNS(
+    reduce_fn::call(
       thrust::detail::select_system(
         typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
       )
@@ -164,10 +171,29 @@ struct reduce_fn final
     )
   )
 
+  // MSVC WAR: MSVC gets angsty and eats all available RAM when we try to detect
+  // if T1 is an execution_policy by using SFINAE. Switching to a static
+  // dispatch pattern to prevent this.
+  template <typename T1, typename T2, typename T3>
+  __host__
+  static auto call(T1&& t1, T2&& t2, T3&& t3)
+  THRUST_RETURNS(
+    reduce_fn::call3(THRUST_FWD(t1), THRUST_FWD(t2), THRUST_FWD(t3),
+                     thrust::is_execution_policy<thrust::remove_cvref_t<T1>>{})
+  )
+
+  template <typename T1, typename T2, typename T3, typename T4>
+  __host__
+  static auto call(T1&& t1, T2&& t2, T3&& t3, T4&& t4)
+  THRUST_RETURNS(
+    reduce_fn::call4(THRUST_FWD(t1), THRUST_FWD(t2), THRUST_FWD(t3), THRUST_FWD(t4),
+                     thrust::is_execution_policy<thrust::remove_cvref_t<T1>>{})
+  )
+
   template <typename ForwardIt, typename Sentinel>
   __host__
   static auto call(ForwardIt&& first, Sentinel&& last)
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     reduce_fn::call(
       thrust::detail::select_system(
         typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
@@ -183,9 +209,9 @@ struct reduce_fn final
   )
 
   template <typename... Args>
-  THRUST_NODISCARD __host__ 
+  THRUST_NODISCARD __host__
   auto operator()(Args&&... args) const
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     call(THRUST_FWD(args)...)
   )
 };
@@ -216,7 +242,7 @@ async_reduce_into(
   , "this algorithm is not implemented for the specified system"
   );
   return {};
-} 
+}
 
 } // namespace unimplemented
 
@@ -241,7 +267,7 @@ struct reduce_into_fn final
   , BinaryOp&& op
   )
   // ADL dispatch.
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     async_reduce_into(
       thrust::detail::derived_cast(thrust::detail::strip_const(exec))
     , THRUST_FWD(first), THRUST_FWD(last)
@@ -257,14 +283,15 @@ struct reduce_into_fn final
   , typename T
   >
   __host__
-  static auto call(
+  static auto call5(
     thrust::detail::execution_policy_base<DerivedPolicy> const& exec
   , ForwardIt&& first, Sentinel&& last
   , OutputIt&& output
   , T&& init
+  , thrust::true_type
   )
   // ADL dispatch.
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     async_reduce_into(
       thrust::detail::derived_cast(thrust::detail::strip_const(exec))
     , THRUST_FWD(first), THRUST_FWD(last)
@@ -280,13 +307,14 @@ struct reduce_into_fn final
   >
   __host__
   static auto
-  call(
+  call4(
     thrust::detail::execution_policy_base<DerivedPolicy> const& exec
   , ForwardIt&& first, Sentinel&& last
   , OutputIt&& output
+  , thrust::true_type
   )
   // ADL dispatch.
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     async_reduce_into(
       thrust::detail::derived_cast(thrust::detail::strip_const(exec))
     , THRUST_FWD(first), THRUST_FWD(last)
@@ -305,15 +333,15 @@ struct reduce_into_fn final
   , typename T, typename BinaryOp
   >
   __host__
-  static auto call(
+  static auto call5(
     ForwardIt&& first, Sentinel&& last
   , OutputIt&& output
   , T&& init
   , BinaryOp&& op
+  , thrust::false_type
   )
-  THRUST_DECLTYPE_RETURNS_WITH_SFINAE_CONDITION(
-    (negation<is_execution_policy<remove_cvref_t<ForwardIt>>>::value)
-  , reduce_into_fn::call(
+  THRUST_RETURNS(
+    reduce_into_fn::call(
       thrust::detail::select_system(
         typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
       , typename iterator_system<remove_cvref_t<OutputIt>>::type{}
@@ -330,14 +358,14 @@ struct reduce_into_fn final
   , typename T
   >
   __host__
-  static auto call(
+  static auto call4(
     ForwardIt&& first, Sentinel&& last
   , OutputIt&& output
   , T&& init
+  , thrust::false_type
   )
-  THRUST_DECLTYPE_RETURNS_WITH_SFINAE_CONDITION(
-    (negation<is_execution_policy<remove_cvref_t<ForwardIt>>>::value)
-  , reduce_into_fn::call(
+  THRUST_RETURNS(
+    reduce_into_fn::call(
       thrust::detail::select_system(
         typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
       , typename iterator_system<remove_cvref_t<OutputIt>>::type{}
@@ -357,7 +385,7 @@ struct reduce_into_fn final
     ForwardIt&& first, Sentinel&& last
   , OutputIt&& output
   )
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     reduce_into_fn::call(
       thrust::detail::select_system(
         typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
@@ -374,10 +402,31 @@ struct reduce_into_fn final
     )
   )
 
+  // MSVC WAR: MSVC gets angsty and eats all available RAM when we try to detect
+  // if T1 is an execution_policy by using SFINAE. Switching to a static
+  // dispatch pattern to prevent this.
+  template <typename T1, typename T2, typename T3, typename T4>
+  __host__
+  static auto call(T1&& t1, T2&& t2, T3&& t3, T4&& t4)
+  THRUST_RETURNS(
+    reduce_into_fn::call4(
+      THRUST_FWD(t1), THRUST_FWD(t2), THRUST_FWD(t3), THRUST_FWD(t4),
+      thrust::is_execution_policy<thrust::remove_cvref_t<T1>>{})
+  )
+
+  template <typename T1, typename T2, typename T3, typename T4, typename T5>
+  __host__
+  static auto call(T1&& t1, T2&& t2, T3&& t3, T4&& t4, T5&& t5)
+  THRUST_RETURNS(
+    reduce_into_fn::call5(
+      THRUST_FWD(t1), THRUST_FWD(t2), THRUST_FWD(t3), THRUST_FWD(t4),
+      THRUST_FWD(t5), thrust::is_execution_policy<thrust::remove_cvref_t<T1>>{})
+  )
+
   template <typename... Args>
-  THRUST_NODISCARD __host__ 
+  THRUST_NODISCARD __host__
   auto operator()(Args&&... args) const
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     call(THRUST_FWD(args)...)
   )
 };
@@ -386,9 +435,12 @@ struct reduce_into_fn final
 
 THRUST_INLINE_CONSTANT reduce_into_detail::reduce_into_fn reduce_into{};
 
+/*! \endcond
+ */
+
 } // namespace async
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #endif
 
diff --git a/thrust/async/scan.h b/thrust/async/scan.h
new file mode 100644
index 000000000..1bcf81257
--- /dev/null
+++ b/thrust/async/scan.h
@@ -0,0 +1,344 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file async/scan.h
+ *  \brief Functions for asynchronously computing prefix scans.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/detail/execution_policy.h>
+#include <thrust/detail/select_system.h>
+#include <thrust/detail/static_assert.h>
+
+#include <thrust/system/detail/adl/async/scan.h>
+
+#include <thrust/type_traits/is_execution_policy.h>
+#include <thrust/type_traits/logical_metafunctions.h>
+#include <thrust/type_traits/remove_cvref.h>
+
+#include <thrust/future.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace async
+{
+
+// Fallback implementations used when no overloads are found via ADL:
+namespace unimplemented
+{
+
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Sentinel,
+          typename OutputIt,
+          typename BinaryOp>
+event<DerivedPolicy>
+async_inclusive_scan(thrust::execution_policy<DerivedPolicy>&,
+                     ForwardIt,
+                     Sentinel,
+                     OutputIt,
+                     BinaryOp)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<ForwardIt, false>::value),
+    "this algorithm is not implemented for the specified system"
+  );
+  return {};
+}
+
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Sentinel,
+          typename OutputIt,
+          typename InitialValueType,
+          typename BinaryOp>
+event<DerivedPolicy>
+async_exclusive_scan(thrust::execution_policy<DerivedPolicy>&,
+                     ForwardIt,
+                     Sentinel,
+                     OutputIt,
+                     InitialValueType,
+                     BinaryOp)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<ForwardIt, false>::value),
+    "this algorithm is not implemented for the specified system"
+  );
+  return {};
+}
+
+} // namespace unimplemented
+
+namespace inclusive_scan_detail
+{
+
+// Include fallback implementation for ADL failures
+using thrust::async::unimplemented::async_inclusive_scan;
+
+// Implementation of the thrust::async::inclusive_scan CPO.
+struct inclusive_scan_fn final
+{
+  template <typename DerivedPolicy,
+            typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename BinaryOp>
+  auto
+  operator()(thrust::detail::execution_policy_base<DerivedPolicy> const& exec,
+             ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out,
+             BinaryOp&& op) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_inclusive_scan(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(op)
+    )
+  )
+
+  template <typename DerivedPolicy,
+            typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt>
+  auto
+  operator()(thrust::detail::execution_policy_base<DerivedPolicy> const& exec,
+             ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_inclusive_scan(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      thrust::plus<>{}
+    )
+  )
+
+  template <typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename BinaryOp,
+            typename = std::enable_if_t<!is_execution_policy_v<remove_cvref_t<ForwardIt>>>>
+  auto operator()(ForwardIt&& first,
+                  Sentinel&& last,
+                  OutputIt&& out,
+                  BinaryOp&& op) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_inclusive_scan(
+      thrust::detail::select_system(
+        iterator_system_t<remove_cvref_t<ForwardIt>>{},
+        iterator_system_t<remove_cvref_t<OutputIt>>{}
+      ),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(op)
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel, typename OutputIt>
+  auto operator()(ForwardIt&& first, Sentinel&& last, OutputIt&& out) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_inclusive_scan(
+      thrust::detail::select_system(
+        iterator_system_t<remove_cvref_t<ForwardIt>>{},
+        iterator_system_t<remove_cvref_t<OutputIt>>{}
+      ),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      thrust::plus<>{}
+    )
+  )
+};
+
+} // namespace inclusive_scan_detail
+
+THRUST_INLINE_CONSTANT inclusive_scan_detail::inclusive_scan_fn inclusive_scan{};
+
+namespace exclusive_scan_detail
+{
+
+// Include fallback implementation for ADL failures
+using thrust::async::unimplemented::async_exclusive_scan;
+
+// Implementation of the thrust::async::exclusive_scan CPO.
+struct exclusive_scan_fn final
+{
+  template <typename DerivedPolicy,
+            typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename InitialValueType,
+            typename BinaryOp>
+  auto
+  operator()(thrust::detail::execution_policy_base<DerivedPolicy> const& exec,
+             ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out,
+             InitialValueType&& init,
+             BinaryOp&& op) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(init),
+      THRUST_FWD(op)
+    )
+  )
+
+  template <typename DerivedPolicy,
+            typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename InitialValueType>
+  auto
+  operator()(thrust::detail::execution_policy_base<DerivedPolicy> const& exec,
+             ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out,
+             InitialValueType&& init) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(init),
+      thrust::plus<>{}
+    )
+  )
+
+  template <typename DerivedPolicy,
+            typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt>
+  auto
+  operator()(thrust::detail::execution_policy_base<DerivedPolicy> const& exec,
+             ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      iterator_value_t<remove_cvref_t<ForwardIt>>{},
+      thrust::plus<>{}
+    )
+  )
+
+  template <typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename InitialValueType,
+            typename BinaryOp,
+            typename = std::enable_if_t<!is_execution_policy_v<remove_cvref_t<ForwardIt>>>>
+  auto
+  operator()(ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out,
+             InitialValueType&& init,
+             BinaryOp&& op) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::select_system(
+        iterator_system_t<remove_cvref_t<ForwardIt>>{},
+        iterator_system_t<remove_cvref_t<OutputIt>>{}
+      ),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(init),
+      THRUST_FWD(op)
+    )
+  )
+
+  template <typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename InitialValueType,
+            typename = std::enable_if_t<!is_execution_policy_v<remove_cvref_t<ForwardIt>>>>
+  auto
+  operator()(ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out,
+             InitialValueType&& init) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::select_system(
+        iterator_system_t<remove_cvref_t<ForwardIt>>{},
+        iterator_system_t<remove_cvref_t<OutputIt>>{}
+      ),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(init),
+      thrust::plus<>{}
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel, typename OutputIt>
+  auto operator()(ForwardIt&& first,
+                  Sentinel&& last,
+                  OutputIt&& out) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::select_system(
+        iterator_system_t<remove_cvref_t<ForwardIt>>{},
+        iterator_system_t<remove_cvref_t<OutputIt>>{}
+      ),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      iterator_value_t<remove_cvref_t<ForwardIt>>{},
+      thrust::plus<>{}
+    )
+  )
+};
+
+} // namespace exclusive_scan_detail
+
+THRUST_INLINE_CONSTANT exclusive_scan_detail::exclusive_scan_fn exclusive_scan{};
+
+} // namespace async
+
+THRUST_NAMESPACE_END
+
+#endif
diff --git a/thrust/async/sort.h b/thrust/async/sort.h
index 5a3ef067a..888179397 100644
--- a/thrust/async/sort.h
+++ b/thrust/async/sort.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2018 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,17 +14,16 @@
  *  limitations under the License.
  */
 
-/*! \file async/sort.h
- *  \brief Functions for asynchronously sorting a range.
+/*! \file
+ *  \brief Algorithms for asynchronously sorting a range.
  */
 
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/cpp11_required.h>
-#include <thrust/detail/modern_gcc_required.h>
+#include <thrust/detail/cpp14_required.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <thrust/detail/static_assert.h>
 #include <thrust/detail/select_system.h>
@@ -35,11 +34,14 @@
 
 #include <thrust/event.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace async
 {
 
+/*! \cond
+ */
+
 namespace unimplemented
 {
 
@@ -47,10 +49,10 @@ template <
   typename DerivedPolicy
 , typename ForwardIt, typename Sentinel, typename StrictWeakOrdering
 >
-__host__ 
+__host__
 event<DerivedPolicy>
 async_stable_sort(
-  thrust::execution_policy<DerivedPolicy>& 
+  thrust::execution_policy<DerivedPolicy>&
 , ForwardIt, Sentinel, StrictWeakOrdering
 )
 {
@@ -59,7 +61,7 @@ async_stable_sort(
   , "this algorithm is not implemented for the specified system"
   );
   return {};
-} 
+}
 
 } // namespace unimplemented
 
@@ -74,14 +76,14 @@ struct stable_sort_fn final
     typename DerivedPolicy
   , typename ForwardIt, typename Sentinel, typename StrictWeakOrdering
   >
-  __host__ 
+  __host__
   static auto call(
     thrust::detail::execution_policy_base<DerivedPolicy> const& exec
   , ForwardIt&& first, Sentinel&& last
   , StrictWeakOrdering&& comp
   )
   // ADL dispatch.
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     async_stable_sort(
       thrust::detail::derived_cast(thrust::detail::strip_const(exec))
     , THRUST_FWD(first), THRUST_FWD(last)
@@ -93,13 +95,13 @@ struct stable_sort_fn final
     typename DerivedPolicy
   , typename ForwardIt, typename Sentinel
   >
-  __host__ 
+  __host__
   static auto call(
     thrust::detail::execution_policy_base<DerivedPolicy> const& exec
   , ForwardIt&& first, Sentinel&& last
   )
   // ADL dispatch.
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     async_stable_sort(
       thrust::detail::derived_cast(thrust::detail::strip_const(exec))
     , THRUST_FWD(first), THRUST_FWD(last)
@@ -110,9 +112,9 @@ struct stable_sort_fn final
   )
 
   template <typename ForwardIt, typename Sentinel, typename StrictWeakOrdering>
-  __host__ 
-  static auto call(ForwardIt&& first, Sentinel&& last, StrictWeakOrdering&& comp) 
-  THRUST_DECLTYPE_RETURNS(
+  __host__
+  static auto call(ForwardIt&& first, Sentinel&& last, StrictWeakOrdering&& comp)
+  THRUST_RETURNS(
     stable_sort_fn::call(
       thrust::detail::select_system(
         typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
@@ -123,9 +125,9 @@ struct stable_sort_fn final
   )
 
   template <typename ForwardIt, typename Sentinel>
-  __host__ 
-  static auto call(ForwardIt&& first, Sentinel&& last) 
-  THRUST_DECLTYPE_RETURNS(
+  __host__
+  static auto call(ForwardIt&& first, Sentinel&& last)
+  THRUST_RETURNS(
     stable_sort_fn::call(
       THRUST_FWD(first), THRUST_FWD(last)
     , thrust::less<
@@ -135,9 +137,9 @@ struct stable_sort_fn final
   )
 
   template <typename... Args>
-  THRUST_NODISCARD __host__ 
+  THRUST_NODISCARD __host__
   auto operator()(Args&&... args) const
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     call(THRUST_FWD(args)...)
   )
 };
@@ -153,7 +155,7 @@ template <
   typename DerivedPolicy
 , typename ForwardIt, typename Sentinel, typename StrictWeakOrdering
 >
-__host__ 
+__host__
 event<DerivedPolicy>
 async_sort(
   thrust::execution_policy<DerivedPolicy>& exec
@@ -164,7 +166,7 @@ async_sort(
     thrust::detail::derived_cast(exec)
   , THRUST_FWD(first), THRUST_FWD(last), THRUST_FWD(comp)
   );
-} 
+}
 
 } // namespace fallback
 
@@ -179,14 +181,14 @@ struct sort_fn final
     typename DerivedPolicy
   , typename ForwardIt, typename Sentinel, typename StrictWeakOrdering
   >
-  __host__ 
+  __host__
   static auto call(
     thrust::detail::execution_policy_base<DerivedPolicy> const& exec
   , ForwardIt&& first, Sentinel&& last
   , StrictWeakOrdering&& comp
   )
   // ADL dispatch.
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     async_sort(
       thrust::detail::derived_cast(thrust::detail::strip_const(exec))
     , THRUST_FWD(first), THRUST_FWD(last)
@@ -198,12 +200,13 @@ struct sort_fn final
     typename DerivedPolicy
   , typename ForwardIt, typename Sentinel
   >
-  __host__ 
-  static auto call(
+  __host__
+  static auto call3(
     thrust::detail::execution_policy_base<DerivedPolicy> const& exec
   , ForwardIt&& first, Sentinel&& last
+  , thrust::true_type
   )
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     sort_fn::call(
       exec
     , THRUST_FWD(first), THRUST_FWD(last)
@@ -214,11 +217,12 @@ struct sort_fn final
   )
 
   template <typename ForwardIt, typename Sentinel, typename StrictWeakOrdering>
-  __host__ 
-  static auto call(ForwardIt&& first, Sentinel&& last, StrictWeakOrdering&& comp) 
-  THRUST_DECLTYPE_RETURNS_WITH_SFINAE_CONDITION(
-    (negation<is_execution_policy<remove_cvref_t<ForwardIt>>>::value)
-  , sort_fn::call(
+  __host__
+  static auto call3(ForwardIt&& first, Sentinel&& last,
+                    StrictWeakOrdering&& comp,
+                    thrust::false_type)
+  THRUST_RETURNS(
+    sort_fn::call(
       thrust::detail::select_system(
         typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
       )
@@ -227,10 +231,21 @@ struct sort_fn final
     )
   )
 
+  // MSVC WAR: MSVC gets angsty and eats all available RAM when we try to detect
+  // if T1 is an execution_policy by using SFINAE. Switching to a static
+  // dispatch pattern to prevent this.
+  template <typename T1, typename T2, typename T3>
+  __host__
+  static auto call(T1&& t1, T2&& t2, T3&& t3)
+  THRUST_RETURNS(
+    sort_fn::call3(THRUST_FWD(t1), THRUST_FWD(t2), THRUST_FWD(t3),
+                   thrust::is_execution_policy<thrust::remove_cvref_t<T1>>{})
+  )
+
   template <typename ForwardIt, typename Sentinel>
-  __host__ 
-  static auto call(ForwardIt&& first, Sentinel&& last) 
-  THRUST_DECLTYPE_RETURNS(
+  __host__
+  static auto call(ForwardIt&& first, Sentinel&& last)
+  THRUST_RETURNS(
     sort_fn::call(
       thrust::detail::select_system(
         typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
@@ -243,9 +258,9 @@ struct sort_fn final
   )
 
   template <typename... Args>
-  THRUST_NODISCARD __host__ 
+  THRUST_NODISCARD __host__
   auto operator()(Args&&... args) const
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     call(THRUST_FWD(args)...)
   )
 };
@@ -254,9 +269,12 @@ struct sort_fn final
 
 THRUST_INLINE_CONSTANT sort_detail::sort_fn sort{};
 
+/*! \endcond
+ */
+
 } // namespace async
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #endif
 
diff --git a/thrust/async/transform.h b/thrust/async/transform.h
index 3e1391415..de72549bf 100644
--- a/thrust/async/transform.h
+++ b/thrust/async/transform.h
@@ -1,9 +1,9 @@
 /*
- *  Copyright 2008-2018 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
- *  You may obtain a transform of the License at
+ *  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
@@ -14,17 +14,16 @@
  *  limitations under the License.
  */
 
-/*! \file async/transform.h
- *  \brief Functions for asynchronously transforming a range.
+/*! \file
+ *  \brief Algorithms for asynchronously transforming a range.
  */
 
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/cpp11_required.h>
-#include <thrust/detail/modern_gcc_required.h>
+#include <thrust/detail/cpp14_required.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <thrust/detail/static_assert.h>
 #include <thrust/detail/select_system.h>
@@ -33,11 +32,14 @@
 
 #include <thrust/event.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace async
 {
 
+/*! \cond
+ */
+
 namespace unimplemented
 {
 
@@ -83,7 +85,7 @@ struct transform_fn final
   , UnaryOperation&& op
   )
   // ADL dispatch.
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     async_transform(
       thrust::detail::derived_cast(thrust::detail::strip_const(exec))
     , THRUST_FWD(first), THRUST_FWD(last)
@@ -102,7 +104,7 @@ struct transform_fn final
   , OutputIt&& output
   , UnaryOperation&& op
   )
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     transform_fn::call(
       thrust::detail::select_system(
         typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
@@ -117,7 +119,7 @@ struct transform_fn final
   template <typename... Args>
   THRUST_NODISCARD __host__
   auto operator()(Args&&... args) const
-  THRUST_DECLTYPE_RETURNS(
+  THRUST_RETURNS(
     call(THRUST_FWD(args)...)
   )
 };
@@ -126,9 +128,11 @@ struct transform_fn final
 
 THRUST_INLINE_CONSTANT transform_detail::transform_fn transform{};
 
+/*! \endcond
+ */
+
 } // namespace async
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #endif
-
diff --git a/thrust/binary_search.h b/thrust/binary_search.h
index 127be16aa..7a4746e0b 100644
--- a/thrust/binary_search.h
+++ b/thrust/binary_search.h
@@ -25,10 +25,8 @@
 #include <thrust/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
-    
 /*! \addtogroup algorithms
  */
 
@@ -67,8 +65,8 @@ namespace thrust
  *  \return The furthermost iterator \c i, such that <tt>*i < value</tt>.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>. 
  *
  *  The following code snippet demonstrates how to use \p lower_bound
  *  to search for values in a ordered range using the \p thrust::device execution policy for parallelization:
@@ -94,7 +92,7 @@ namespace thrust
  *  thrust::lower_bound(thrust::device, input.begin(), input.end(), 9); // returns input.end()
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -120,8 +118,8 @@ ForwardIterator lower_bound(const thrust::detail::execution_policy_base<DerivedP
  *  \param value The value to be searched.
  *  \return The furthermost iterator \c i, such that <tt>*i < value</tt>.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>. 
  *
  *  The following code snippet demonstrates how to use \p lower_bound
  *  to search for values in a ordered range.
@@ -146,7 +144,7 @@ ForwardIterator lower_bound(const thrust::detail::execution_policy_base<DerivedP
  *  thrust::lower_bound(input.begin(), input.end(), 9); // returns input.end()
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -176,9 +174,9 @@ ForwardIterator lower_bound(ForwardIterator first,
  *  \return The furthermost iterator \c i, such that <tt>comp(*i, value)</tt> is \c true.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
  *  \tparam T is comparable to \p ForwardIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p lower_bound
  *  to search for values in a ordered range using the \p thrust::device execution policy for parallelization:
@@ -205,7 +203,7 @@ ForwardIterator lower_bound(ForwardIterator first,
  *  thrust::lower_bound(input.begin(), input.end(), 9, thrust::less<int>()); // returns input.end()
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -234,9 +232,9 @@ ForwardIterator lower_bound(const thrust::detail::execution_policy_base<DerivedP
  *  \param comp The comparison operator.
  *  \return The furthermost iterator \c i, such that <tt>comp(*i, value)</tt> is \c true.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
  *  \tparam T is comparable to \p ForwardIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p lower_bound
  *  to search for values in a ordered range.
@@ -262,7 +260,7 @@ ForwardIterator lower_bound(const thrust::detail::execution_policy_base<DerivedP
  *  thrust::lower_bound(input.begin(), input.end(), 9, thrust::less<int>()); // returns input.end()
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -292,8 +290,8 @@ ForwardIterator lower_bound(ForwardIterator first,
  *  \return The furthermost iterator \c i, such that <tt>value < *i</tt> is \c false.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>. 
  *
  *  The following code snippet demonstrates how to use \p upper_bound
  *  to search for values in a ordered range using the \p thrust::device execution policy for parallelism:
@@ -319,7 +317,7 @@ ForwardIterator lower_bound(ForwardIterator first,
  *  thrust::upper_bound(thrust::device, input.begin(), input.end(), 9); // returns input.end()
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/upper_bound
  *  \see \p lower_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -346,8 +344,8 @@ ForwardIterator upper_bound(const thrust::detail::execution_policy_base<DerivedP
  *  \param value The value to be searched.
  *  \return The furthermost iterator \c i, such that <tt>value < *i</tt> is \c false.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>. 
  *
  *  The following code snippet demonstrates how to use \p upper_bound
  *  to search for values in a ordered range.
@@ -372,7 +370,7 @@ ForwardIterator upper_bound(const thrust::detail::execution_policy_base<DerivedP
  *  thrust::upper_bound(input.begin(), input.end(), 9); // returns input.end()
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/upper_bound
  *  \see \p lower_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -402,9 +400,9 @@ ForwardIterator upper_bound(ForwardIterator first,
  *  \return The furthermost iterator \c i, such that <tt>comp(value, *i)</tt> is \c false.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
  *  \tparam T is comparable to \p ForwardIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p upper_bound
  *  to search for values in a ordered range using the \p thrust::device execution policy for parallelization:
@@ -431,7 +429,7 @@ ForwardIterator upper_bound(ForwardIterator first,
  *  thrust::upper_bound(thrust::device, input.begin(), input.end(), 9, thrust::less<int>()); // returns input.end()
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/upper_bound
  *  \see \p lower_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -459,9 +457,9 @@ ForwardIterator upper_bound(const thrust::detail::execution_policy_base<DerivedP
  *  \param comp The comparison operator.
  *  \return The furthermost iterator \c i, such that <tt>comp(value, *i)</tt> is \c false.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
  *  \tparam T is comparable to \p ForwardIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p upper_bound
  *  to search for values in a ordered range.
@@ -487,7 +485,7 @@ ForwardIterator upper_bound(const thrust::detail::execution_policy_base<DerivedP
  *  thrust::upper_bound(input.begin(), input.end(), 9, thrust::less<int>()); // returns input.end()
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/upper_bound
  *  \see \p lower_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -516,8 +514,8 @@ ForwardIterator upper_bound(ForwardIterator first,
  *  \return \c true if an equivalent element exists in <tt>[first, last)</tt>, otherwise \c false.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>. 
  *
  *  The following code snippet demonstrates how to use \p binary_search
  *  to search for values in a ordered range using the \p thrust::device execution policy for parallelization:
@@ -543,7 +541,7 @@ ForwardIterator upper_bound(ForwardIterator first,
  *  thrust::binary_search(thrust::device, input.begin(), input.end(), 9); // returns false
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/binary_search
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
@@ -569,8 +567,8 @@ bool binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &e
  *  \param value The value to be searched.
  *  \return \c true if an equivalent element exists in <tt>[first, last)</tt>, otherwise \c false.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>. 
  *
  *  The following code snippet demonstrates how to use \p binary_search
  *  to search for values in a ordered range.
@@ -595,7 +593,7 @@ bool binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &e
  *  thrust::binary_search(input.begin(), input.end(), 9); // returns false
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/binary_search
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
@@ -624,9 +622,9 @@ bool binary_search(ForwardIterator first,
  *  \return \c true if an equivalent element exists in <tt>[first, last)</tt>, otherwise \c false.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
  *  \tparam T is comparable to \p ForwardIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p binary_search
  *  to search for values in a ordered range using the \p thrust::device execution policy for parallelization:
@@ -653,7 +651,7 @@ bool binary_search(ForwardIterator first,
  *  thrust::binary_search(thrust::device, input.begin(), input.end(), 9, thrust::less<int>()); // returns false
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/binary_search
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
@@ -681,9 +679,9 @@ bool binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &e
  *  \param comp The comparison operator.
  *  \return \c true if an equivalent element exists in <tt>[first, last)</tt>, otherwise \c false.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
  *  \tparam T is comparable to \p ForwardIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p binary_search
  *  to search for values in a ordered range.
@@ -709,7 +707,7 @@ bool binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &e
  *  thrust::binary_search(input.begin(), input.end(), 9, thrust::less<int>()); // returns false
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/binary_search
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
@@ -751,8 +749,8 @@ bool binary_search(ForwardIterator first,
  *  \return A \p pair of iterators <tt>[i, j)</tt> that define the range of equivalent elements.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>. 
  *
  *  The following code snippet demonstrates how to use \p equal_range
  *  to search for values in a ordered range using the \p thrust::device execution policy for parallelization:
@@ -778,7 +776,7 @@ bool binary_search(ForwardIterator first,
  *  thrust::equal_range(thrust::device, input.begin(), input.end(), 9); // returns [input.end(), input.end)
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/equal_range.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/equal_range
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p binary_search
@@ -818,8 +816,8 @@ equal_range(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
  *  \param value The value to be searched.
  *  \return A \p pair of iterators <tt>[i, j)</tt> that define the range of equivalent elements.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>. 
  *
  *  The following code snippet demonstrates how to use \p equal_range
  *  to search for values in a ordered range.
@@ -844,7 +842,7 @@ equal_range(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
  *  thrust::equal_range(input.begin(), input.end(), 9); // returns [input.end(), input.end)
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/equal_range.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/equal_range
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p binary_search
@@ -888,9 +886,9 @@ equal_range(ForwardIterator first,
  *  \return A \p pair of iterators <tt>[i, j)</tt> that define the range of equivalent elements.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
  *  \tparam T is comparable to \p ForwardIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p equal_range
  *  to search for values in a ordered range using the \p thrust::device execution policy for parallelization:
@@ -917,7 +915,7 @@ equal_range(ForwardIterator first,
  *  thrust::equal_range(thrust::device, input.begin(), input.end(), 9, thrust::less<int>()); // returns [input.end(), input.end)
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/equal_range.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/equal_range
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p binary_search
@@ -960,9 +958,9 @@ equal_range(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
  *  \param comp The comparison operator.
  *  \return A \p pair of iterators <tt>[i, j)</tt> that define the range of equivalent elements.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
  *  \tparam T is comparable to \p ForwardIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p equal_range
  *  to search for values in a ordered range.
@@ -988,7 +986,7 @@ equal_range(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
  *  thrust::equal_range(input.begin(), input.end(), 9, thrust::less<int>()); // returns [input.end(), input.end)
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/equal_range.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/equal_range
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p binary_search
@@ -1028,10 +1026,10 @@ equal_range(ForwardIterator first,
  *  \param result The beginning of the output sequence.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
@@ -1071,7 +1069,7 @@ equal_range(ForwardIterator first,
  *  // output is now [0, 1, 1, 2, 4, 5]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -1098,10 +1096,10 @@ OutputIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPo
  *  \param values_last The end of the search values sequence.
  *  \param result The beginning of the output sequence.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
@@ -1138,7 +1136,7 @@ OutputIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPo
  *  // output is now [0, 1, 1, 2, 4, 5]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -1169,12 +1167,12 @@ OutputIterator lower_bound(ForwardIterator first,
  *  \param comp The comparison operator.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *                        and \c InputIterator's \c value_type is comparable to \p ForwardIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -1213,7 +1211,7 @@ OutputIterator lower_bound(ForwardIterator first,
  *  // output is now [0, 1, 1, 2, 4, 5]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -1243,12 +1241,12 @@ OutputIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPo
  *  \param result The beginning of the output sequence.
  *  \param comp The comparison operator.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *                        and \c InputIterator's \c value_type is comparable to \p ForwardIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -1286,7 +1284,7 @@ OutputIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPo
  *  // output is now [0, 1, 1, 2, 4, 5]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -1316,10 +1314,10 @@ OutputIterator lower_bound(ForwardIterator first,
  *  \param result The beginning of the output sequence.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
@@ -1359,7 +1357,7 @@ OutputIterator lower_bound(ForwardIterator first,
  *  // output is now [1, 1, 2, 2, 5, 5]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/upper_bound
  *  \see \p upper_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -1386,10 +1384,10 @@ OutputIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPo
  *  \param values_last The end of the search values sequence.
  *  \param result The beginning of the output sequence.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
@@ -1426,7 +1424,7 @@ OutputIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPo
  *  // output is now [1, 1, 2, 2, 5, 5]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/upper_bound
  *  \see \p upper_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -1457,12 +1455,12 @@ OutputIterator upper_bound(ForwardIterator first,
  *  \param comp The comparison operator.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *                        and \c InputIterator's \c value_type is comparable to \p ForwardIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -1503,7 +1501,7 @@ OutputIterator upper_bound(ForwardIterator first,
  *  // output is now [1, 1, 2, 2, 5, 5]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/upper_bound
  *  \see \p lower_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -1533,12 +1531,12 @@ OutputIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPo
  *  \param result The beginning of the output sequence.
  *  \param comp The comparison operator.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *                        and \c InputIterator's \c value_type is comparable to \p ForwardIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -1576,7 +1574,7 @@ OutputIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPo
  *  // output is now [1, 1, 2, 2, 5, 5]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/upper_bound
  *  \see \p lower_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -1607,10 +1605,10 @@ OutputIterator upper_bound(ForwardIterator first,
  *  \param result The beginning of the output sequence.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and bool is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
@@ -1650,7 +1648,7 @@ OutputIterator upper_bound(ForwardIterator first,
  *  // output is now [true, false, true, false, true, false]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/binary_search
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
@@ -1678,10 +1676,10 @@ OutputIterator binary_search(const thrust::detail::execution_policy_base<Derived
  *  \param values_last The end of the search values sequence.
  *  \param result The beginning of the output sequence.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and bool is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
@@ -1718,7 +1716,7 @@ OutputIterator binary_search(const thrust::detail::execution_policy_base<Derived
  *  // output is now [true, false, true, false, true, false]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/binary_search
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
@@ -1750,12 +1748,12 @@ OutputIterator binary_search(ForwardIterator first,
  *  \param comp The comparison operator.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and bool is convertible to \c OutputIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -1796,7 +1794,7 @@ OutputIterator binary_search(ForwardIterator first,
  *  // output is now [true, false, true, false, true, false]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/binary_search
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
@@ -1827,12 +1825,12 @@ OutputIterator binary_search(const thrust::detail::execution_policy_base<Derived
  *  \param result The beginning of the output sequence.
  *  \param comp The comparison operator.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and bool is convertible to \c OutputIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -1870,7 +1868,7 @@ OutputIterator binary_search(const thrust::detail::execution_policy_base<Derived
  *  // output is now [true, false, true, false, true, false]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/binary_search
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
@@ -1895,8 +1893,7 @@ OutputIterator binary_search(ForwardIterator first,
 /*! \} // end searching
  */
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/binary_search.inl>
 
diff --git a/thrust/cmake/FindTBB.cmake b/thrust/cmake/FindTBB.cmake
new file mode 100644
index 000000000..01e53d5e7
--- /dev/null
+++ b/thrust/cmake/FindTBB.cmake
@@ -0,0 +1,446 @@
+# - Find ThreadingBuildingBlocks include dirs and libraries
+# Use this module by invoking find_package with the form:
+#  find_package(TBB
+#    [REQUIRED]             # Fail with error if TBB is not found
+#    )                      #
+# Once done, this will define
+#
+#  TBB_FOUND - system has TBB
+#  TBB_INCLUDE_DIRS - the TBB include directories
+#  TBB_LIBRARIES - TBB libraries to be lined, doesn't include malloc or
+#                  malloc proxy
+#  TBB::tbb - imported target for the TBB library
+#
+#  TBB_VERSION - Product Version Number ("MAJOR.MINOR")
+#  TBB_VERSION_MAJOR - Major Product Version Number
+#  TBB_VERSION_MINOR - Minor Product Version Number
+#  TBB_INTERFACE_VERSION - Engineering Focused Version Number
+#  TBB_COMPATIBLE_INTERFACE_VERSION - The oldest major interface version
+#                                     still supported. This uses the engineering
+#                                     focused interface version numbers.
+#
+#  TBB_MALLOC_FOUND - system has TBB malloc library
+#  TBB_MALLOC_INCLUDE_DIRS - the TBB malloc include directories
+#  TBB_MALLOC_LIBRARIES - The TBB malloc libraries to be lined
+#  TBB::malloc - imported target for the TBB malloc library
+#
+#  TBB_MALLOC_PROXY_FOUND - system has TBB malloc proxy library
+#  TBB_MALLOC_PROXY_INCLUDE_DIRS = the TBB malloc proxy include directories
+#  TBB_MALLOC_PROXY_LIBRARIES - The TBB malloc proxy libraries to be lined
+#  TBB::malloc_proxy - imported target for the TBB malloc proxy library
+#
+#
+# This module reads hints about search locations from variables:
+#  ENV TBB_ARCH_PLATFORM - for eg. set it to "mic" for Xeon Phi builds
+#  ENV TBB_ROOT or just TBB_ROOT - root directory of tbb installation
+#  ENV TBB_BUILD_PREFIX - specifies the build prefix for user built tbb
+#                         libraries. Should be specified with ENV TBB_ROOT
+#                         and optionally...
+#  ENV TBB_BUILD_DIR - if build directory is different than ${TBB_ROOT}/build
+#
+#
+# Modified by Robert Maynard from the original OGRE source
+#
+#-------------------------------------------------------------------
+# This file is part of the CMake build system for OGRE
+#     (Object-oriented Graphics Rendering Engine)
+# For the latest info, see http://www.ogre3d.org/
+#
+# The contents of this file are placed in the public domain. Feel
+# free to make use of it in any way you like.
+#-------------------------------------------------------------------
+#
+#=============================================================================
+# Copyright 2010-2012 Kitware, Inc.
+# Copyright 2012      Rolf Eike Beer <eike@sf-mail.de>
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+
+
+#=============================================================================
+#  FindTBB helper functions and macros
+#
+
+#====================================================
+# Fix the library path in case it is a linker script
+#====================================================
+function(tbb_extract_real_library library real_library)
+  if(NOT UNIX OR NOT EXISTS ${library})
+    set(${real_library} "${library}" PARENT_SCOPE)
+    return()
+  endif()
+
+  #Read in the first 4 bytes and see if they are the ELF magic number
+  set(_elf_magic "7f454c46")
+  file(READ ${library} _hex_data OFFSET 0 LIMIT 4 HEX)
+  if(_hex_data STREQUAL _elf_magic)
+    #we have opened a elf binary so this is what
+    #we should link to
+    set(${real_library} "${library}" PARENT_SCOPE)
+    return()
+  endif()
+
+  file(READ ${library} _data OFFSET 0 LIMIT 1024)
+  if("${_data}" MATCHES "INPUT \\(([^(]+)\\)")
+    #extract out the .so name from REGEX MATCH command
+    set(_proper_so_name "${CMAKE_MATCH_1}")
+
+    #construct path to the real .so which is presumed to be in the same directory
+    #as the input file
+    get_filename_component(_so_dir "${library}" DIRECTORY)
+    set(${real_library} "${_so_dir}/${_proper_so_name}" PARENT_SCOPE)
+  else()
+    #unable to determine what this library is so just hope everything works
+    #and pass it unmodified.
+    set(${real_library} "${library}" PARENT_SCOPE)
+  endif()
+endfunction()
+
+#===============================================
+# Do the final processing for the package find.
+#===============================================
+macro(findpkg_finish PREFIX TARGET_NAME)
+  if (${PREFIX}_INCLUDE_DIR AND ${PREFIX}_LIBRARY)
+    set(${PREFIX}_FOUND TRUE)
+    set (${PREFIX}_INCLUDE_DIRS ${${PREFIX}_INCLUDE_DIR})
+    set (${PREFIX}_LIBRARIES ${${PREFIX}_LIBRARY})
+  else ()
+    if (${PREFIX}_FIND_REQUIRED)
+      message(FATAL_ERROR "Required library ${PREFIX} not found.")
+    elseif (NOT ${PREFIX}_FIND_QUIETLY)
+      message("Library ${PREFIX} not found.")
+    endif()
+    return()
+  endif ()
+
+  if (NOT TARGET "TBB::${TARGET_NAME}")
+    if (${PREFIX}_LIBRARY_RELEASE)
+      tbb_extract_real_library(${${PREFIX}_LIBRARY_RELEASE} real_release)
+    endif ()
+    if (${PREFIX}_LIBRARY_DEBUG)
+      tbb_extract_real_library(${${PREFIX}_LIBRARY_DEBUG} real_debug)
+    endif ()
+    add_library(TBB::${TARGET_NAME} UNKNOWN IMPORTED)
+    set_target_properties(TBB::${TARGET_NAME} PROPERTIES
+      INTERFACE_INCLUDE_DIRECTORIES "${${PREFIX}_INCLUDE_DIR}")
+    if (${PREFIX}_LIBRARY_DEBUG AND ${PREFIX}_LIBRARY_RELEASE)
+      set_target_properties(TBB::${TARGET_NAME} PROPERTIES
+        IMPORTED_LOCATION "${real_release}"
+        IMPORTED_LOCATION_DEBUG "${real_debug}"
+        IMPORTED_LOCATION_RELEASE "${real_release}")
+    elseif (${PREFIX}_LIBRARY_RELEASE)
+      set_target_properties(TBB::${TARGET_NAME} PROPERTIES
+        IMPORTED_LOCATION "${real_release}")
+    elseif (${PREFIX}_LIBRARY_DEBUG)
+      set_target_properties(TBB::${TARGET_NAME} PROPERTIES
+        IMPORTED_LOCATION "${real_debug}")
+    endif ()
+  endif ()
+
+  #mark the following variables as internal variables
+  mark_as_advanced(${PREFIX}_INCLUDE_DIR
+                   ${PREFIX}_LIBRARY
+                   ${PREFIX}_LIBRARY_DEBUG
+                   ${PREFIX}_LIBRARY_RELEASE)
+endmacro()
+
+#===============================================
+# Generate debug names from given release names
+#===============================================
+macro(get_debug_names PREFIX)
+  foreach(i ${${PREFIX}})
+    set(${PREFIX}_DEBUG ${${PREFIX}_DEBUG} ${i}d ${i}D ${i}_d ${i}_D ${i}_debug ${i})
+  endforeach()
+endmacro()
+
+#===============================================
+# See if we have env vars to help us find tbb
+#===============================================
+macro(getenv_path VAR)
+   set(ENV_${VAR} $ENV{${VAR}})
+   # replace won't work if var is blank
+   if (ENV_${VAR})
+     string( REGEX REPLACE "\\\\" "/" ENV_${VAR} ${ENV_${VAR}} )
+   endif ()
+endmacro()
+
+#===============================================
+# Couple a set of release AND debug libraries
+#===============================================
+macro(make_library_set PREFIX)
+  if (${PREFIX}_RELEASE AND ${PREFIX}_DEBUG)
+    set(${PREFIX} optimized ${${PREFIX}_RELEASE} debug ${${PREFIX}_DEBUG})
+  elseif (${PREFIX}_RELEASE)
+    set(${PREFIX} ${${PREFIX}_RELEASE})
+  elseif (${PREFIX}_DEBUG)
+    set(${PREFIX} ${${PREFIX}_DEBUG})
+  endif ()
+endmacro()
+
+
+#=============================================================================
+#  Now to actually find TBB
+#
+
+# Get path, convert backslashes as ${ENV_${var}}
+getenv_path(TBB_ROOT)
+
+# initialize search paths
+set(TBB_PREFIX_PATH ${TBB_ROOT} ${ENV_TBB_ROOT})
+set(TBB_INC_SEARCH_PATH "")
+set(TBB_LIB_SEARCH_PATH "")
+
+
+# If user built from sources
+set(TBB_BUILD_PREFIX $ENV{TBB_BUILD_PREFIX})
+if (TBB_BUILD_PREFIX AND ENV_TBB_ROOT)
+  getenv_path(TBB_BUILD_DIR)
+  if (NOT ENV_TBB_BUILD_DIR)
+    set(ENV_TBB_BUILD_DIR ${ENV_TBB_ROOT}/build)
+  endif ()
+
+  # include directory under ${ENV_TBB_ROOT}/include
+  list(APPEND TBB_LIB_SEARCH_PATH
+    ${ENV_TBB_BUILD_DIR}/${TBB_BUILD_PREFIX}_release
+    ${ENV_TBB_BUILD_DIR}/${TBB_BUILD_PREFIX}_debug)
+endif ()
+
+
+# For Windows, let's assume that the user might be using the precompiled
+# TBB packages from the main website. These use a rather awkward directory
+# structure (at least for automatically finding the right files) depending
+# on platform and compiler, but we'll do our best to accommodate it.
+# Not adding the same effort for the precompiled linux builds, though. Those
+# have different versions for CC compiler versions and linux kernels which
+# will never adequately match the user's setup, so there is no feasible way
+# to detect the "best" version to use. The user will have to manually
+# select the right files. (Chances are the distributions are shipping their
+# custom version of tbb, anyway, so the problem is probably nonexistent.)
+if (WIN32 AND MSVC)
+  set(COMPILER_PREFIX "vc7.1")
+  if (MSVC_VERSION EQUAL 1400)
+    set(COMPILER_PREFIX "vc8")
+  elseif(MSVC_VERSION EQUAL 1500)
+    set(COMPILER_PREFIX "vc9")
+  elseif(MSVC_VERSION EQUAL 1600)
+    set(COMPILER_PREFIX "vc10")
+  elseif(MSVC_VERSION EQUAL 1700)
+    set(COMPILER_PREFIX "vc11")
+  elseif(MSVC_VERSION EQUAL 1800)
+    set(COMPILER_PREFIX "vc12")
+  elseif(MSVC_VERSION GREATER_EQUAL 1900 AND MSVC_VERSION LESS_EQUAL 1939)
+      # 1900-1925 actually spans three Visual Studio versions:
+      # 1900      = VS 14.0 (v140 toolset) a.k.a. MSVC 2015
+      # 1910-1919 = VS 15.0 (v141 toolset) a.k.a. MSVC 2017
+      # 1920-1929 = VS 16.0 (v142 toolset) a.k.a. MSVC 2019
+      # 1930-1939 = VS 17.0 (v143 toolset) a.k.a. MSVC 2022
+      #
+      # But these are binary compatible and TBB's open source distribution only
+      # ships a single vs14 lib (as of 2020.0)
+    set(COMPILER_PREFIX "vc14")
+  else()
+    # The next poor soul who finds themselves having to decode visual studio
+    # version conventions may find these helpful:
+    # - https://cmake.org/cmake/help/latest/variable/MSVC_VERSION.html
+    # - https://en.wikipedia.org/wiki/Microsoft_Visual_C%2B%2B#Internal_version_numbering
+    message(AUTHOR_WARNING
+      "Unrecognized MSVC version (${MSVC_VERSION}). "
+      "Please update FindTBB.cmake. "
+      "Some TBB_* CMake variables may need to be set manually."
+    )
+  endif ()
+
+  # for each prefix path, add ia32/64\${COMPILER_PREFIX}\lib to the lib search path
+  foreach (dir IN LISTS TBB_PREFIX_PATH)
+    if (CMAKE_CL_64)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/ia64/${COMPILER_PREFIX}/lib)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/ia64/${COMPILER_PREFIX})
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/intel64/${COMPILER_PREFIX}/lib)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/intel64/${COMPILER_PREFIX})
+    else ()
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/ia32/${COMPILER_PREFIX}/lib)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/ia32/${COMPILER_PREFIX})
+    endif ()
+  endforeach ()
+endif ()
+
+# For OS X binary distribution, choose libc++ based libraries for Mavericks (10.9)
+# and above and AppleClang
+if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND
+    NOT CMAKE_SYSTEM_VERSION VERSION_LESS 13.0)
+  set (USE_LIBCXX OFF)
+  cmake_policy(GET CMP0025 POLICY_VAR)
+
+  if (POLICY_VAR STREQUAL "NEW")
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
+      set (USE_LIBCXX ON)
+    endif ()
+  else ()
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+      set (USE_LIBCXX ON)
+    endif ()
+  endif ()
+
+  if (USE_LIBCXX)
+    foreach (dir IN LISTS TBB_PREFIX_PATH)
+      list (APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/libc++ ${dir}/libc++/lib)
+    endforeach ()
+  endif ()
+endif ()
+
+# check compiler ABI
+if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+  set(COMPILER_PREFIX)
+  if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7)
+    list(APPEND COMPILER_PREFIX "gcc4.7")
+  endif()
+  if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.4)
+    list(APPEND COMPILER_PREFIX "gcc4.4")
+  endif()
+  list(APPEND COMPILER_PREFIX "gcc4.1")
+elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  set(COMPILER_PREFIX)
+  if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.6)
+    list(APPEND COMPILER_PREFIX "gcc4.7")
+  endif()
+  list(APPEND COMPILER_PREFIX "gcc4.4")
+else() # Assume compatibility with 4.4 for other compilers
+  list(APPEND COMPILER_PREFIX "gcc4.4")
+endif ()
+
+# if platform architecture is explicitly specified
+set(TBB_ARCH_PLATFORM $ENV{TBB_ARCH_PLATFORM})
+if (TBB_ARCH_PLATFORM)
+  foreach (dir IN LISTS TBB_PREFIX_PATH)
+    list(APPEND TBB_LIB_SEARCH_PATH ${dir}/${TBB_ARCH_PLATFORM}/lib)
+    list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/${TBB_ARCH_PLATFORM})
+  endforeach ()
+endif ()
+
+foreach (dir IN LISTS TBB_PREFIX_PATH)
+  foreach (prefix IN LISTS COMPILER_PREFIX)
+    if (CMAKE_SIZEOF_VOID_P EQUAL 8)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/intel64)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/intel64/${prefix})
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/intel64/lib)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/intel64/${prefix}/lib)
+    else ()
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/ia32)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/ia32/${prefix})
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/ia32/lib)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/ia32/${prefix}/lib)
+    endif ()
+  endforeach()
+endforeach ()
+
+# add general search paths
+foreach (dir IN LISTS TBB_PREFIX_PATH)
+  list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib ${dir}/Lib ${dir}/lib/tbb
+    ${dir}/Libs)
+  list(APPEND TBB_INC_SEARCH_PATH ${dir}/include ${dir}/Include
+    ${dir}/include/tbb)
+endforeach ()
+
+set(TBB_LIBRARY_NAMES tbb)
+get_debug_names(TBB_LIBRARY_NAMES)
+
+find_path(TBB_INCLUDE_DIR
+          NAMES tbb/tbb.h
+          PATHS ${TBB_INC_SEARCH_PATH})
+
+find_library(TBB_LIBRARY_RELEASE
+             NAMES ${TBB_LIBRARY_NAMES}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+find_library(TBB_LIBRARY_DEBUG
+             NAMES ${TBB_LIBRARY_NAMES_DEBUG}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+make_library_set(TBB_LIBRARY)
+
+findpkg_finish(TBB tbb)
+
+#if we haven't found TBB no point on going any further
+if (NOT TBB_FOUND)
+  return()
+endif ()
+
+#=============================================================================
+# Look for TBB's malloc package
+set(TBB_MALLOC_LIBRARY_NAMES tbbmalloc)
+get_debug_names(TBB_MALLOC_LIBRARY_NAMES)
+
+find_path(TBB_MALLOC_INCLUDE_DIR
+          NAMES tbb/tbb.h
+          PATHS ${TBB_INC_SEARCH_PATH})
+
+find_library(TBB_MALLOC_LIBRARY_RELEASE
+             NAMES ${TBB_MALLOC_LIBRARY_NAMES}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+find_library(TBB_MALLOC_LIBRARY_DEBUG
+             NAMES ${TBB_MALLOC_LIBRARY_NAMES_DEBUG}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+make_library_set(TBB_MALLOC_LIBRARY)
+
+findpkg_finish(TBB_MALLOC tbbmalloc)
+
+#=============================================================================
+# Look for TBB's malloc proxy package
+set(TBB_MALLOC_PROXY_LIBRARY_NAMES tbbmalloc_proxy)
+get_debug_names(TBB_MALLOC_PROXY_LIBRARY_NAMES)
+
+find_path(TBB_MALLOC_PROXY_INCLUDE_DIR
+          NAMES tbb/tbbmalloc_proxy.h
+          PATHS ${TBB_INC_SEARCH_PATH})
+
+find_library(TBB_MALLOC_PROXY_LIBRARY_RELEASE
+             NAMES ${TBB_MALLOC_PROXY_LIBRARY_NAMES}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+find_library(TBB_MALLOC_PROXY_LIBRARY_DEBUG
+             NAMES ${TBB_MALLOC_PROXY_LIBRARY_NAMES_DEBUG}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+make_library_set(TBB_MALLOC_PROXY_LIBRARY)
+
+findpkg_finish(TBB_MALLOC_PROXY tbbmalloc_proxy)
+
+
+#=============================================================================
+# Parse all the version numbers from tbb.
+if(NOT TBB_VERSION)
+  if(EXISTS "${TBB_INCLUDE_DIR}/tbb/version.h")
+    # The newer oneTBB provides tbb/version.h but no tbb/tbb_stddef.h.
+    set(version_file "${TBB_INCLUDE_DIR}/tbb/version.h")
+  else()
+    # Older TBB provides tbb/tbb_stddef.h but no tbb/version.h.
+    set(version_file "${TBB_INCLUDE_DIR}/tbb/tbb_stddef.h")
+  endif()
+
+  file(STRINGS
+      "${version_file}"
+      TBB_VERSION_CONTENTS
+      REGEX "VERSION")
+
+  string(REGEX REPLACE
+    ".*#define TBB_VERSION_MAJOR ([0-9]+).*" "\\1"
+    TBB_VERSION_MAJOR "${TBB_VERSION_CONTENTS}")
+
+  string(REGEX REPLACE
+    ".*#define TBB_VERSION_MINOR ([0-9]+).*" "\\1"
+    TBB_VERSION_MINOR "${TBB_VERSION_CONTENTS}")
+
+  string(REGEX REPLACE
+        ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\\1"
+        TBB_INTERFACE_VERSION "${TBB_VERSION_CONTENTS}")
+
+  string(REGEX REPLACE
+        ".*#define TBB_COMPATIBLE_INTERFACE_VERSION ([0-9]+).*" "\\1"
+        TBB_COMPATIBLE_INTERFACE_VERSION "${TBB_VERSION_CONTENTS}")
+
+  set(TBB_VERSION "${TBB_VERSION_MAJOR}.${TBB_VERSION_MINOR}")
+endif()
diff --git a/thrust/cmake/README.md b/thrust/cmake/README.md
new file mode 100644
index 000000000..ae296b635
--- /dev/null
+++ b/thrust/cmake/README.md
@@ -0,0 +1,226 @@
+# Using Thrust with CMake
+
+Thrust provides configuration files that simplify using Thrust
+from other CMake projects. Requirements:
+
+- Thrust >= 1.9.10
+- CMake >= 3.15
+
+See the [Fixing Legacy FindThrust.cmake](#fixing-legacy-findthrustcmake)
+section for solutions that work on older Thrust versions.
+
+## User Guide
+
+#### Default Configuration (CUDA)
+
+Thrust is configured using a `thrust_create_target` CMake function that
+assembles a complete interface to the Thrust library:
+
+```cmake
+find_package(Thrust REQUIRED CONFIG)
+thrust_create_target(Thrust)
+target_link_libraries(MyProgram Thrust)
+```
+
+The first argument is the name of the interface target to create, and any
+additional options will be used to configure the target. By default,
+`thrust_create_target` will configure its result to use CUDA acceleration.
+
+If desired, `thrust_create_target` may be called multiple times to build
+several unique Thrust interface targets with different configurations, as
+detailed below.
+
+**Note:** If CMake is unable to locate Thrust, specify the path to Thrust's CMake
+configuration directory (where this README file is located) as `Thrust_DIR`.
+If cloning Thrust from github, this would be
+
+```
+$ cmake . -DThrust_DIR=<thrust git repo root>/thrust/cmake/
+```
+
+#### TBB / OpenMP
+
+To explicitly specify host/device systems, `HOST` and `DEVICE` arguments can be
+passed to `thrust_create_target`. If an explicit system is not specified, the
+target will default to using CPP for host and/or CUDA for device.
+
+```cmake
+thrust_create_target(ThrustTBB DEVICE TBB)
+thrust_create_target(ThrustOMP HOST CPP DEVICE OMP)
+```
+
+will create targets `ThrustTBB` and `ThrustOMP`. Both will use the serial `CPP`
+host system, but will find and use TBB or OpenMP for the device system.
+
+#### Configure Target from Cache Options
+
+To allow a Thrust target to be configurable easily via `cmake-gui` or
+`ccmake`, pass the `FROM_OPTIONS` flag to `thrust_create_target`. This will add
+`THRUST_HOST_SYSTEM` and `THRUST_DEVICE_SYSTEM` options to the CMake cache that
+allow selection from the systems supported by this version of Thrust.
+
+```cmake
+thrust_create_target(Thrust FROM_OPTIONS
+  [HOST_OPTION <option name>]
+  [DEVICE_OPTION <option name>]
+  [HOST_OPTION_DOC <doc string>]
+  [DEVICE_OPTION_DOC <doc string>]
+  [HOST <default host system name>]
+  [DEVICE <default device system name>]
+  [ADVANCED]
+)
+```
+
+The optional arguments have sensible defaults, but may be configured per
+`thrust_create_target` call:
+
+| Argument            | Default                 | Description                     |
+|---------------------|-------------------------|---------------------------------|
+| `HOST_OPTION`       | `THRUST_HOST_SYSTEM`    | Name of cache option for host   |
+| `DEVICE_OPTION`     | `THRUST_DEVICE_SYSTEM`  | Name of cache option for device |
+| `HOST_OPTION_DOC`   | Thrust's host system.   | Docstring for host option       |
+| `DEVICE_OPTION_DOC` | Thrust's device system. | Docstring for device option     |
+| `HOST`              | `CPP`                   | Default host system             |
+| `DEVICE`            | `CUDA`                  | Default device system           |
+| `ADVANCED`          | *N/A*                   | Mark cache options advanced     |
+
+### Specifying Thrust Version Requirements
+
+A specific version of Thrust may be required in the `find_package` call:
+
+```cmake
+find_package(Thrust 1.9.10)
+```
+
+will only consider Thrust installations with version `1.9.10.X`. An exact match
+down to the patch version can be forced by using `EXACT` matching:
+
+```cmake
+find_package(Thrust 1.9.10.1 EXACT)
+```
+
+would only match the 1.9.10.1 release.
+
+#### Using an Explicit TBB or OpenMP CMake Target
+
+When `thrust_create_target` is called, it will lazily load the requested
+systems on-demand through internal `find_package` calls. If a project already
+uses TBB or OpenMP, it may specify a CMake target for Thrust to share instead:
+
+```cmake
+thrust_set_TBB_target(MyTBBTarget)
+thrust_set_OMP_target(MyOMPTarget)
+```
+
+These functions must be called **before** the corresponding system is loaded
+through `thrust_create_target` or `find_package(Thrust COMPONENT [OMP|TBB])`.
+
+#### Using an Explicit libcu++ CMake Target
+
+In contrast to the optional TBB/OMP dependencies, there is no
+`thrust_set_libcudacxx_target` function that specifies an explicit libcu++
+target. This is because libcu++ is always required and must be found during the
+initial `find_target(Thrust)` call that defines these functions.
+
+To force Thrust to use a specific libcu++ target, ensure that either the
+`Thrust::libcudacxx` or `libcudacxx::libcudacxx` targets are defined prior to
+the first invocation of `find_package(Thrust)`. Thrust will automatically use
+these, giving preference to the `Thrust::libcudacxx` target.
+
+#### Testing for Systems
+
+The following functions check if a system has been found, either by lazy loading
+through `thrust_create_target` or as a `find_package` `COMPONENT` /
+`OPTIONAL_COMPONENT`:
+
+```cmake
+# Set var_name to TRUE or FALSE if an individual system has been found:
+thrust_is_cuda_system_found(<var_name>)
+thrust_is_cpp_system_found(<var_name>)
+thrust_is_tbb_system_found(<var_name>)
+thrust_is_omp_system_found(<var_name>)
+
+# Generic version that takes a component name from CUDA, CPP, TBB, OMP:
+thrust_is_system_found(<component_name> <var_name>)
+
+# Defines `THRUST_*_FOUND` variables in the current scope that reflect the
+# state of all known systems. Can be used to refresh these flags after
+# lazy system loading.
+thrust_update_system_found_flags()
+```
+
+#### Debugging
+
+Thrust will produce a detailed log describing its targets, cache options, and
+interfaces when `--log-level=VERBOSE` is passed to CMake 3.15.7 or newer:
+
+```
+$ cmake . --log-level=VERBOSE
+```
+
+This can be handy for inspecting interface and dependency information.
+
+## Fixing Legacy FindThrust.cmake
+
+A community-created `FindThrust.cmake` module exists and is necessary to find
+Thrust installations prior to Thrust 1.9.10. Its usage is discouraged whenever
+possible and the config files in this directory should be strongly preferred.
+However, projects that need to support old versions of Thrust may still need to
+use the legacy `FindThrust.cmake` with pre-1.9.10 installations.
+
+One popular flavor of this find module has a version parsing bug. Projects that
+rely on `FindThrust.cmake` should check for this and patch their copies as
+follows.
+
+Replace:
+
+```cmake
+string( REGEX MATCH "^[0-9]" major ${version} )
+string( REGEX REPLACE "^${major}00" "" version "${version}" )
+string( REGEX MATCH "^[0-9]" minor ${version} )
+string( REGEX REPLACE "^${minor}0" "" version "${version}" )
+```
+
+with:
+
+```cmake
+math(EXPR major "${version} / 100000")
+math(EXPR minor "(${version} / 100) % 1000")
+math(EXPR version "${version} % 100")
+```
+
+# Thrust Developer Documentation
+
+This portion of the file contains descriptions of Thrust's internal CMake target
+structure for Thrust developers. It should not be necessary for users
+who just want to use Thrust from their projects.
+
+## Internal Targets
+
+By default, `find_package(Thrust)` will only create a single `Thrust::Thrust`
+target that describes where the actual Thrust headers are located. It does not
+locate or create configurations for any dependencies; these are lazily loaded
+on-demand by calls to `create_thrust_target`, or when explicitly requested via
+`find_package`'s component mechanism.
+
+As mentioned, the basic Thrust interface is described by the `Thrust::Thrust`
+target.
+
+Each backend system (`CPP`, `CUDA`, `TBB`, `OMP`) is described by multiple
+targets:
+
+- `Thrust::${system}`
+  - Specifies an interface configured to build against all
+    dependencies for this backend (including `Thrust::Thrust`).
+  - For example, the `Thrust::CUDA` target is an interface
+    target that combines the interfaces of both Thrust and CUB.
+- `Thrust::${system}::Host`
+  - Configures an interface for using a specific host system.
+  - Multiple `::Host` targets cannot be combined in the same library/executable.
+    Attempting to do so will produce a CMake configuration error.
+  - Only defined for systems that support being used as the host.
+- `Thrust::${system}::Device`
+  - Configures an interface for using a specific device system.
+  - Multiple `::Device` targets cannot be combined in the same library/executable.
+    Attempting to do so will produce a CMake configuration error.
+  - Only defined for systems that support being used as the device.
diff --git a/thrust/cmake/thrust-config-version.cmake b/thrust/cmake/thrust-config-version.cmake
new file mode 100644
index 000000000..cf9407a4c
--- /dev/null
+++ b/thrust/cmake/thrust-config-version.cmake
@@ -0,0 +1,31 @@
+# Parse version information from version.h:
+include("${CMAKE_CURRENT_LIST_DIR}/thrust-header-search.cmake")
+
+file(READ "${_THRUST_VERSION_INCLUDE_DIR}/thrust/version.h" THRUST_VERSION_HEADER)
+string(REGEX MATCH "#define[ \t]+THRUST_VERSION[ \t]+([0-9]+)" DUMMY "${THRUST_VERSION_HEADER}")
+set(THRUST_VERSION_FLAT ${CMAKE_MATCH_1})
+# Note that Thrust calls this the PATCH number, CMake calls it the TWEAK number:
+string(REGEX MATCH "#define[ \t]+THRUST_PATCH_NUMBER[ \t]+([0-9]+)" DUMMY "${THRUST_VERSION_HEADER}")
+set(THRUST_VERSION_TWEAK ${CMAKE_MATCH_1})
+
+math(EXPR THRUST_VERSION_MAJOR "${THRUST_VERSION_FLAT} / 100000")
+math(EXPR THRUST_VERSION_MINOR "(${THRUST_VERSION_FLAT} / 100) % 1000")
+math(EXPR THRUST_VERSION_PATCH "${THRUST_VERSION_FLAT} % 100") # Thrust: "subminor" CMake: "patch"
+
+set(THRUST_VERSION "${THRUST_VERSION_MAJOR}.${THRUST_VERSION_MINOR}.${THRUST_VERSION_PATCH}.${THRUST_VERSION_TWEAK}")
+
+set(PACKAGE_VERSION ${THRUST_VERSION})
+set(PACKAGE_VERSION_COMPATIBLE FALSE)
+set(PACKAGE_VERSION_EXACT FALSE)
+set(PACKAGE_VERSION_UNSUITABLE FALSE)
+
+if(PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION)
+  if(THRUST_VERSION_MAJOR VERSION_EQUAL PACKAGE_FIND_VERSION_MAJOR AND
+     THRUST_VERSION_MINOR VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MINOR)
+    set(PACKAGE_VERSION_COMPATIBLE TRUE)
+  endif()
+
+  if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
+    set(PACKAGE_VERSION_EXACT TRUE)
+  endif()
+endif()
diff --git a/thrust/cmake/thrust-config.cmake b/thrust/cmake/thrust-config.cmake
new file mode 100644
index 000000000..fe88a961c
--- /dev/null
+++ b/thrust/cmake/thrust-config.cmake
@@ -0,0 +1,745 @@
+#
+# find_package(Thrust) config file.
+#
+# Provided by NVIDIA under the same license as the associated Thrust library.
+#
+# Reply-To: Allison Vacanti <alliepiper16@gmail.com>
+#
+# *****************************************************************************
+# **     The following is a short reference to using Thrust from CMake.      **
+# ** For more details, see the README.md in the same directory as this file. **
+# *****************************************************************************
+#
+# # General Usage:
+# find_package(Thrust REQUIRED CONFIG)
+# thrust_create_target(Thrust [options])
+# target_link_libraries(some_project_lib Thrust)
+#
+# # Create default target with: HOST=CPP DEVICE=CUDA
+# thrust_create_target(TargetName)
+#
+# # Create target with: HOST=CPP DEVICE=TBB
+# thrust_create_target(TargetName DEVICE TBB)
+#
+# # Create target with: HOST=TBB DEVICE=OMP
+# thrust_create_target(TargetName HOST TBB DEVICE OMP)
+#
+# # Create CMake cache options THRUST_[HOST|DEVICE]_SYSTEM and configure a
+# # target from them. This allows these systems to be changed by developers at
+# # configure time, per build.
+# thrust_create_target(TargetName FROM_OPTIONS
+#   [HOST_OPTION <option_name>]      # Optionally rename the host system option
+#   [DEVICE_OPTION <option_name>]    # Optionally rename the device system option
+#   [HOST_OPTION_DOC <doc_string>]   # Optionally change the cache label
+#   [DEVICE_OPTION_DOC <doc_string>] # Optionally change the cache label
+#   [HOST <default system>]          # Optionally change the default backend
+#   [DEVICE <default system>]        # Optionally change the default backend
+#   [ADVANCED]                       # Optionally mark options as advanced
+# )
+#
+# # Use a custom TBB, CUB, and/or OMP
+# # (Note that once set, these cannot be changed. This includes COMPONENT
+# # preloading and lazy lookups in thrust_create_target)
+# find_package(Thrust REQUIRED)
+# thrust_set_CUB_target(MyCUBTarget)  # MyXXXTarget contains an existing
+# thrust_set_TBB_target(MyTBBTarget)  # interface to XXX for Thrust to use.
+# thrust_set_OMP_target(MyOMPTarget)
+# thrust_create_target(ThrustWithMyCUB DEVICE CUDA)
+# thrust_create_target(ThrustWithMyTBB DEVICE TBB)
+# thrust_create_target(ThrustWithMyOMP DEVICE OMP)
+#
+# # Create target with HOST=CPP DEVICE=CUDA and some advanced flags set
+# thrust_create_target(TargetName
+#   IGNORE_DEPRECATED_API         # Silence build warnings about deprecated APIs
+#   IGNORE_DEPRECATED_CPP_DIALECT # Silence build warnings about deprecated compilers and C++ standards
+#   IGNORE_DEPRECATED_CPP_11      # Only silence deprecation warnings for C++11
+#   IGNORE_DEPRECATED_COMPILER    # Only silence deprecation warnings for old compilers
+#   IGNORE_CUB_VERSION            # Skip configure-time and compile-time CUB version checks
+# )
+#
+# # Test if a particular system has been loaded. ${var_name} is set to TRUE or
+# # FALSE to indicate if "system" is found.
+# thrust_is_system_found(<system> <var_name>)
+# thrust_is_cuda_system_found(<var_name>)
+# thrust_is_tbb_system_found(<var_name>)
+# thrust_is_omp_system_found(<var_name>)
+# thrust_is_cpp_system_found(<var_name>)
+#
+# # Define / update THRUST_${system}_FOUND flags in current scope
+# thrust_update_system_found_flags()
+#
+# # View verbose log with target and dependency information:
+# $ cmake . --log-level=VERBOSE (CMake 3.15.7 and above)
+#
+# # Print debugging output to status channel:
+# thrust_debug_internal_targets()
+# thrust_debug_target(TargetName "${THRUST_VERSION}")
+
+cmake_minimum_required(VERSION 3.15)
+
+# Minimum supported libcudacxx version:
+set(thrust_libcudacxx_version 1.8.0)
+
+################################################################################
+# User variables and APIs. Users can rely on these:
+#
+
+# Advertise system options:
+set(THRUST_HOST_SYSTEM_OPTIONS
+  CPP OMP TBB
+  CACHE INTERNAL "Valid Thrust host systems."
+  FORCE
+)
+set(THRUST_DEVICE_SYSTEM_OPTIONS
+  CUDA CPP OMP TBB
+  CACHE INTERNAL "Valid Thrust device systems"
+  FORCE
+)
+
+# Workaround cmake issue #20670 https://gitlab.kitware.com/cmake/cmake/-/issues/20670
+set(THRUST_VERSION ${${CMAKE_FIND_PACKAGE_NAME}_VERSION} CACHE INTERNAL "" FORCE)
+set(THRUST_VERSION_MAJOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MAJOR} CACHE INTERNAL "" FORCE)
+set(THRUST_VERSION_MINOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MINOR} CACHE INTERNAL "" FORCE)
+set(THRUST_VERSION_PATCH ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_PATCH} CACHE INTERNAL "" FORCE)
+set(THRUST_VERSION_TWEAK ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_TWEAK} CACHE INTERNAL "" FORCE)
+set(THRUST_VERSION_COUNT ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_COUNT} CACHE INTERNAL "" FORCE)
+
+function(thrust_create_target target_name)
+  thrust_debug("Assembling target ${target_name}. Options: ${ARGN}" internal)
+  set(options
+    ADVANCED
+    FROM_OPTIONS
+    IGNORE_CUB_VERSION_CHECK
+    IGNORE_DEPRECATED_API
+    IGNORE_DEPRECATED_COMPILER
+    IGNORE_DEPRECATED_CPP_11
+    IGNORE_DEPRECATED_CPP_DIALECT
+  )
+  set(keys
+    DEVICE
+    DEVICE_OPTION
+    DEVICE_OPTION_DOC
+    HOST
+    HOST_OPTION
+    HOST_OPTION_DOC
+  )
+  cmake_parse_arguments(TCT "${options}" "${keys}" "" ${ARGN})
+  if (TCT_UNPARSED_ARGUMENTS)
+    message(AUTHOR_WARNING
+      "Unrecognized arguments passed to thrust_create_target: "
+      ${TCT_UNPARSED_ARGUMENTS}
+    )
+  endif()
+
+  # Check that the main Thrust internal target is available
+  # (functions have global scope, targets have directory scope, so this
+  # might happen)
+  if (NOT TARGET Thrust::Thrust)
+    message(AUTHOR_WARNING
+      "The `thrust_create_target` function was called outside the scope of the "
+      "thrust targets. Call find_package again to recreate targets."
+    )
+  endif()
+
+  _thrust_set_if_undefined(TCT_HOST CPP)
+  _thrust_set_if_undefined(TCT_DEVICE CUDA)
+  _thrust_set_if_undefined(TCT_HOST_OPTION THRUST_HOST_SYSTEM)
+  _thrust_set_if_undefined(TCT_DEVICE_OPTION THRUST_DEVICE_SYSTEM)
+  _thrust_set_if_undefined(TCT_HOST_OPTION_DOC "Thrust host system.")
+  _thrust_set_if_undefined(TCT_DEVICE_OPTION_DOC "Thrust device system.")
+
+  if (NOT TCT_HOST IN_LIST THRUST_HOST_SYSTEM_OPTIONS)
+    message(FATAL_ERROR
+      "Requested HOST=${TCT_HOST}; must be one of ${THRUST_HOST_SYSTEM_OPTIONS}"
+    )
+  endif()
+
+  if (NOT TCT_DEVICE IN_LIST THRUST_DEVICE_SYSTEM_OPTIONS)
+    message(FATAL_ERROR
+      "Requested DEVICE=${TCT_DEVICE}; must be one of ${THRUST_DEVICE_SYSTEM_OPTIONS}"
+    )
+  endif()
+
+  if (TCT_FROM_OPTIONS)
+    _thrust_create_cache_options(
+      ${TCT_HOST} ${TCT_DEVICE}
+      ${TCT_HOST_OPTION} ${TCT_DEVICE_OPTION}
+      ${TCT_HOST_OPTION_DOC} ${TCT_DEVICE_OPTION_DOC}
+      ${TCT_ADVANCED}
+    )
+    set(TCT_HOST ${${TCT_HOST_OPTION}})
+    set(TCT_DEVICE ${${TCT_DEVICE_OPTION}})
+    thrust_debug("Current option settings:" internal)
+    thrust_debug("  - ${TCT_HOST_OPTION}=${TCT_HOST}" internal)
+    thrust_debug("  - ${TCT_DEVICE_OPTION}=${TCT_DEVICE}" internal)
+  endif()
+
+  _thrust_find_backend(${TCT_HOST} REQUIRED)
+  _thrust_find_backend(${TCT_DEVICE} REQUIRED)
+
+  # We can just create an INTERFACE IMPORTED target here instead of going
+  # through _thrust_declare_interface_alias as long as we aren't hanging any
+  # Thrust/CUB include paths directly on ${target_name}.
+  add_library(${target_name} INTERFACE IMPORTED)
+  target_link_libraries(${target_name}
+    INTERFACE
+    Thrust::${TCT_HOST}::Host
+    Thrust::${TCT_DEVICE}::Device
+  )
+
+  # This would be nice to enforce, but breaks when using old cmake + new
+  # compiler, since cmake doesn't know what features the new compiler version
+  # supports.
+  # Leaving this here as a reminder not to add it back. Just let the
+  # compile-time checks in thrust/detail/config/cpp_dialect.h handle it.
+  #
+  #  if (NOT TCT_IGNORE_DEPRECATED_CPP_DIALECT)
+  #    if (TCT_IGNORE_DEPRECATED_CPP_11)
+  #      target_compile_features(${target_name} INTERFACE cxx_std_11)
+  #    else()
+  #      target_compile_features(${target_name} INTERFACE cxx_std_14)
+  #    endif()
+  #  endif()
+
+  if (TCT_IGNORE_DEPRECATED_CPP_DIALECT)
+    target_compile_definitions(${target_name} INTERFACE "THRUST_IGNORE_DEPRECATED_CPP_DIALECT")
+  endif()
+
+  if (TCT_IGNORE_DEPRECATED_API)
+    target_compile_definitions(${target_name} INTERFACE "THRUST_IGNORE_DEPRECATED_API")
+  endif()
+
+  if (TCT_IGNORE_DEPRECATED_CPP_11)
+    target_compile_definitions(${target_name} INTERFACE "THRUST_IGNORE_DEPRECATED_CPP_11")
+  endif()
+
+  if (TCT_IGNORE_DEPRECATED_COMPILER)
+    target_compile_definitions(${target_name} INTERFACE "THRUST_IGNORE_DEPRECATED_COMPILER")
+  endif()
+
+  if (TCT_IGNORE_CUB_VERSION_CHECK)
+    target_compile_definitions(${target_name} INTERFACE "THRUST_IGNORE_CUB_VERSION_CHECK")
+  else()
+    if (("${TCT_HOST}" STREQUAL "CUDA" OR "${TCT_DEVICE}" STREQUAL "CUDA") AND
+    (NOT THRUST_VERSION VERSION_EQUAL THRUST_CUB_VERSION))
+      message(FATAL_ERROR
+        "The version of CUB found by CMake is not compatible with this release of Thrust. "
+        "CUB is now included in the CUDA Toolkit, so you no longer need to use your own checkout of CUB. "
+        "Pass IGNORE_CUB_VERSION_CHECK to thrust_create_target to ignore. "
+        "(CUB ${THRUST_CUB_VERSION}, Thrust ${THRUST_VERSION})."
+        )
+    endif()
+  endif()
+
+  thrust_debug_target(${target_name} "Thrust ${THRUST_VERSION}"  internal)
+endfunction()
+
+function(thrust_is_system_found system var_name)
+  if (TARGET Thrust::${system})
+    set(${var_name} TRUE PARENT_SCOPE)
+  else()
+    set(${var_name} FALSE PARENT_SCOPE)
+  endif()
+endfunction()
+
+function(thrust_is_cpp_system_found var_name)
+  thrust_is_system_found(CPP ${var_name})
+  set(${var_name} ${${var_name}} PARENT_SCOPE)
+endfunction()
+
+function(thrust_is_cuda_system_found var_name)
+  thrust_is_system_found(CUDA ${var_name})
+  set(${var_name} ${${var_name}} PARENT_SCOPE)
+endfunction()
+
+function(thrust_is_tbb_system_found var_name)
+  thrust_is_system_found(TBB ${var_name})
+  set(${var_name} ${${var_name}} PARENT_SCOPE)
+endfunction()
+
+function(thrust_is_omp_system_found var_name)
+  thrust_is_system_found(OMP ${var_name})
+  set(${var_name} ${${var_name}} PARENT_SCOPE)
+endfunction()
+
+# Since components are loaded lazily, this will refresh the
+# THRUST_${component}_FOUND flags in the current scope.
+# Alternatively, check system states individually using the
+# thrust_is_system_found functions.
+macro(thrust_update_system_found_flags)
+  set(THRUST_FOUND TRUE)
+  thrust_is_system_found(CPP  THRUST_CPP_FOUND)
+  thrust_is_system_found(CUDA THRUST_CUDA_FOUND)
+  thrust_is_system_found(TBB  THRUST_TBB_FOUND)
+  thrust_is_system_found(OMP  THRUST_OMP_FOUND)
+endmacro()
+
+function(thrust_debug msg)
+  # Use the VERBOSE channel when called internally
+  # Run `cmake . --log-level=VERBOSE` to view.
+  if ("${ARGN}" STREQUAL "internal")
+    # If CMake is too old to know about the VERBOSE channel, just be silent.
+    # Users reproduce much the same output on the STATUS channel by using:
+    # thrust_create_target(Thrust [...])
+    # thrust_debug_internal_targets()
+    # thrust_debug_target(Thrust)
+    if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.15.7")
+      set(channel VERBOSE)
+    else()
+      return()
+    endif()
+  else()
+    set(channel STATUS)
+  endif()
+
+  message(${channel} "Thrust: ${msg}")
+endfunction()
+
+# Print details of the specified target.
+function(thrust_debug_target target_name version)
+  if (NOT TARGET ${target_name})
+    return()
+  endif()
+
+  set(is_internal "${ARGN}")
+
+  if (version)
+    set(version "(${version})")
+  endif()
+
+  thrust_debug("TargetInfo: ${target_name}: ${version}" ${is_internal})
+
+  function(_thrust_print_prop_if_set target_name prop)
+    get_target_property(value ${target_name} ${prop})
+    if (value)
+      thrust_debug("TargetInfo: ${target_name} > ${prop}: ${value}" ${is_internal})
+    endif()
+  endfunction()
+
+  function(_thrust_print_imported_prop_if_set target_name prop)
+    get_target_property(imported ${target_name} IMPORTED)
+    get_target_property(type ${target_name} TYPE)
+    if (imported AND NOT ${type} STREQUAL "INTERFACE_LIBRARY")
+      _thrust_print_prop_if_set(${target_name} ${prop})
+    endif()
+  endfunction()
+
+  _thrust_print_prop_if_set(${target_name} ALIASED_TARGET)
+  _thrust_print_prop_if_set(${target_name} IMPORTED)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_COMPILE_DEFINITIONS)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_COMPILE_FEATURES)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_COMPILE_OPTIONS)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_INCLUDE_DIRECTORIES)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_LINK_DEPENDS)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_LINK_DIRECTORIES)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_LINK_LIBRARIES)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_LINK_OPTIONS)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_SYSTEM_INCLUDE_DIRECTORIES)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_THRUST_HOST)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_THRUST_DEVICE)
+  _thrust_print_imported_prop_if_set(${target_name} IMPORTED_LOCATION)
+  _thrust_print_imported_prop_if_set(${target_name} IMPORTED_LOCATION_DEBUG)
+  _thrust_print_imported_prop_if_set(${target_name} IMPORTED_LOCATION_RELEASE)
+endfunction()
+
+function(thrust_debug_internal_targets)
+  function(_thrust_debug_backend_targets backend version)
+    thrust_debug_target(Thrust::${backend} "${version}")
+    thrust_debug_target(Thrust::${backend}::Host "${version}")
+    thrust_debug_target(Thrust::${backend}::Device "${version}")
+  endfunction()
+
+  thrust_debug_target(Thrust::Thrust "${THRUST_VERSION}")
+
+  _thrust_debug_backend_targets(CPP "Thrust ${THRUST_VERSION}")
+
+  _thrust_debug_backend_targets(OMP "${THRUST_OMP_VERSION}")
+  thrust_debug_target(OpenMP::OpenMP_CXX "${THRUST_OMP_VERSION}")
+
+  _thrust_debug_backend_targets(TBB "${THRUST_TBB_VERSION}")
+  thrust_debug_target(TBB:tbb "${THRUST_TBB_VERSION}")
+
+  _thrust_debug_backend_targets(CUDA "CUB ${THRUST_CUB_VERSION}")
+  thrust_debug_target(CUB::CUB "${THRUST_CUB_VERSION}")
+  thrust_debug_target(libcudacxx::libcudacxx "${THRUST_libcudacxx_VERSION}")
+endfunction()
+
+################################################################################
+# Internal utilities. Subject to change.
+#
+
+function(_thrust_set_if_undefined var)
+  if (NOT DEFINED ${var})
+    set(${var} ${ARGN} PARENT_SCOPE)
+  endif()
+endfunction()
+
+function(_thrust_declare_interface_alias alias_name ugly_name)
+  # 1) Only IMPORTED and ALIAS targets can be placed in a namespace.
+  # 2) When an IMPORTED library is linked to another target, its include
+  #    directories are treated as SYSTEM includes.
+  # 3) nvcc will automatically check the CUDA Toolkit include path *before* the
+  #    system includes. This means that the Toolkit Thrust will *always* be used
+  #    during compilation, and the include paths of an IMPORTED Thrust::Thrust
+  #    target will never have any effect.
+  # 4) This behavior can be fixed by setting the property NO_SYSTEM_FROM_IMPORTED
+  #    on EVERY target that links to Thrust::Thrust. This would be a burden and a
+  #    footgun for our users. Forgetting this would silently pull in the wrong thrust!
+  # 5) A workaround is to make a non-IMPORTED library outside of the namespace,
+  #    configure it, and then ALIAS it into the namespace (or ALIAS and then
+  #    configure, that seems to work too).
+  add_library(${ugly_name} INTERFACE)
+  add_library(${alias_name} ALIAS ${ugly_name})
+endfunction()
+
+# Create cache options for selecting the user/device systems with ccmake/cmake-gui.
+function(_thrust_create_cache_options host device host_option device_option host_doc device_doc advanced)
+  thrust_debug("Creating system cache options: (advanced=${advanced})" internal)
+  thrust_debug("  - Host Option=${host_option} Default=${host} Doc='${host_doc}'" internal)
+  thrust_debug("  - Device Option=${device_option} Default=${device} Doc='${device_doc}'" internal)
+  set(${host_option} ${host} CACHE STRING "${host_doc}")
+  set_property(CACHE ${host_option} PROPERTY STRINGS ${THRUST_HOST_SYSTEM_OPTIONS})
+  set(${device_option} ${device} CACHE STRING "${device_doc}")
+  set_property(CACHE ${device_option} PROPERTY STRINGS ${THRUST_DEVICE_SYSTEM_OPTIONS})
+  if (advanced)
+    mark_as_advanced(${host_option} ${device_option})
+  endif()
+endfunction()
+
+# Create Thrust::${backend}::Host and Thrust::${backend}::Device targets.
+# Assumes that `Thrust::${backend}` and `_Thrust_${backend}` have been created
+# by _thrust_declare_interface_alias and configured to bring in system
+# dependency interfaces (including Thrust::Thrust).
+function(_thrust_setup_system backend)
+  set(backend_target_alias "Thrust::${backend}")
+
+  if (backend IN_LIST THRUST_HOST_SYSTEM_OPTIONS)
+    set(host_target "_Thrust_${backend}_Host")
+    set(host_target_alias "Thrust::${backend}::Host")
+    if (NOT TARGET ${host_target_alias})
+      _thrust_declare_interface_alias(${host_target_alias} ${host_target})
+      target_compile_definitions(${host_target} INTERFACE
+        "THRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_${backend}")
+      target_link_libraries(${host_target} INTERFACE ${backend_target_alias})
+      set_property(TARGET ${host_target} PROPERTY INTERFACE_THRUST_HOST ${backend})
+      set_property(TARGET ${host_target} APPEND PROPERTY COMPATIBLE_INTERFACE_STRING THRUST_HOST)
+      thrust_debug_target(${host_target_alias} "" internal)
+    endif()
+  endif()
+
+  if (backend IN_LIST THRUST_DEVICE_SYSTEM_OPTIONS)
+    set(device_target "_Thrust_${backend}_Device")
+    set(device_target_alias "Thrust::${backend}::Device")
+    if (NOT TARGET ${device_target_alias})
+      _thrust_declare_interface_alias(${device_target_alias} ${device_target})
+      target_compile_definitions(${device_target} INTERFACE
+        "THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_${backend}")
+      target_link_libraries(${device_target} INTERFACE ${backend_target_alias})
+      set_property(TARGET ${device_target} PROPERTY INTERFACE_THRUST_DEVICE ${backend})
+      set_property(TARGET ${device_target} APPEND PROPERTY COMPATIBLE_INTERFACE_STRING THRUST_DEVICE)
+      thrust_debug_target(${device_target_alias} "" internal)
+    endif()
+  endif()
+endfunction()
+
+# Use the provided cub_target for the CUDA backend. If Thrust::CUB already
+# exists, this call has no effect.
+function(thrust_set_CUB_target cub_target)
+  if (NOT TARGET Thrust::CUB)
+    thrust_debug("Setting CUB target to ${cub_target}" internal)
+    # Workaround cmake issue #20670 https://gitlab.kitware.com/cmake/cmake/-/issues/20670
+    set(THRUST_CUB_VERSION ${CUB_VERSION} CACHE INTERNAL
+      "CUB version used by Thrust"
+      FORCE
+    )
+    _thrust_declare_interface_alias(Thrust::CUB _Thrust_CUB)
+    target_link_libraries(_Thrust_CUB INTERFACE ${cub_target})
+    thrust_debug_target(${cub_target} "${THRUST_CUB_VERSION}" internal)
+    thrust_debug_target(Thrust::CUB "CUB ${THRUST_CUB_VERSION}" internal)
+  endif()
+endfunction()
+
+# Internal use only -- libcudacxx must be found during the initial
+# `find_package(Thrust)` call and cannot be set afterwards. See README.md in
+# this directory for details on using a specific libcudacxx target.
+function(_thrust_set_libcudacxx_target libcudacxx_target)
+  if (NOT TARGET Thrust::libcudacxx)
+    thrust_debug("Setting libcudacxx target to ${libcudacxx_target}" internal)
+    # Workaround cmake issue #20670 https://gitlab.kitware.com/cmake/cmake/-/issues/20670
+    set(THRUST_libcudacxx_VERSION ${libcudacxx_VERSION} CACHE INTERNAL
+      "libcudacxx version used by Thrust"
+      FORCE
+    )
+    _thrust_declare_interface_alias(Thrust::libcudacxx _Thrust_libcudacxx)
+    target_link_libraries(_Thrust_libcudacxx INTERFACE ${libcudacxx_target})
+    thrust_debug_target(${libcudacxx_target} "${THRUST_libcudacxx_VERSION}" internal)
+    thrust_debug_target(Thrust::libcudacxx "libcudacxx ${THRUST_libcudacxx_VERSION}" internal)
+  endif()
+endfunction()
+
+# Use the provided tbb_target for the TBB backend. If Thrust::TBB already
+# exists, this call has no effect.
+function(thrust_set_TBB_target tbb_target)
+  if (NOT TARGET Thrust::TBB)
+    thrust_debug("Setting TBB target to ${tbb_target}" internal)
+    # Workaround cmake issue #20670 https://gitlab.kitware.com/cmake/cmake/-/issues/20670
+    set(THRUST_TBB_VERSION ${TBB_VERSION} CACHE INTERNAL
+      "TBB version used by Thrust"
+      FORCE
+    )
+    _thrust_declare_interface_alias(Thrust::TBB _Thrust_TBB)
+    target_link_libraries(_Thrust_TBB INTERFACE Thrust::Thrust ${tbb_target})
+    thrust_debug_target(${tbb_target} "${THRUST_TBB_VERSION}" internal)
+    thrust_debug_target(Thrust::TBB "${THRUST_TBB_VERSION}" internal)
+    _thrust_setup_system(TBB)
+  endif()
+endfunction()
+
+# Use the provided omp_target for the OMP backend. If Thrust::OMP already
+# exists, this call has no effect.
+function(thrust_set_OMP_target omp_target)
+  if (NOT TARGET Thrust::OMP)
+    thrust_debug("Setting OMP target to ${omp_target}" internal)
+    # Workaround cmake issue #20670 https://gitlab.kitware.com/cmake/cmake/-/issues/20670
+    set(THRUST_OMP_VERSION ${OpenMP_CXX_VERSION} CACHE INTERNAL
+      "OpenMP version used by Thrust"
+      FORCE
+    )
+    _thrust_declare_interface_alias(Thrust::OMP _Thrust_OMP)
+    target_link_libraries(_Thrust_OMP INTERFACE Thrust::Thrust ${omp_target})
+    thrust_debug_target(${omp_target} "${THRUST_OMP_VERSION}" internal)
+    thrust_debug_target(Thrust::OMP "${THRUST_OMP_VERSION}" internal)
+    _thrust_setup_system(OMP)
+  endif()
+endfunction()
+
+function(_thrust_find_CPP required)
+  if (NOT TARGET Thrust::CPP)
+    thrust_debug("Generating CPP targets." internal)
+    _thrust_declare_interface_alias(Thrust::CPP _Thrust_CPP)
+    target_link_libraries(_Thrust_CPP INTERFACE Thrust::Thrust)
+    thrust_debug_target(Thrust::CPP "Thrust ${THRUST_VERSION}" internal)
+    _thrust_setup_system(CPP)
+  endif()
+endfunction()
+
+# This must be a macro instead of a function to ensure that backends passed to
+# find_package(Thrust COMPONENTS [...]) have their full configuration loaded
+# into the current scope. This provides at least some remedy for CMake issue
+# #20670 -- otherwise variables like CUB_VERSION, etc won't be in the caller's
+# scope.
+macro(_thrust_find_CUDA required)
+  if (NOT TARGET Thrust::CUB)
+    thrust_debug("Searching for CUB ${required}" internal)
+    find_package(CUB ${THRUST_VERSION} CONFIG
+      ${_THRUST_QUIET_FLAG}
+      ${required}
+      NO_DEFAULT_PATH # Only check the explicit HINTS below:
+      HINTS
+        "${_THRUST_INCLUDE_DIR}/dependencies/cub" # Source layout (GitHub)
+        "${_THRUST_INCLUDE_DIR}/../cub/cub/cmake" # Source layout (Perforce)
+        "${_THRUST_CMAKE_DIR}/.."                 # Install layout
+    )
+
+    if (TARGET CUB::CUB)
+      thrust_set_CUB_target(CUB::CUB)
+    else()
+      thrust_debug("CUB not found!" internal)
+    endif()
+  endif()
+
+  if (NOT TARGET Thrust::CUDA)
+    _thrust_declare_interface_alias(Thrust::CUDA _Thrust_CUDA)
+    _thrust_setup_system(CUDA)
+    target_link_libraries(_Thrust_CUDA INTERFACE
+      Thrust::Thrust
+      Thrust::CUB
+    )
+    thrust_debug_target(Thrust::CUDA "" internal)
+  endif()
+endmacro()
+
+# This must be a macro instead of a function to ensure that backends passed to
+# find_package(Thrust COMPONENTS [...]) have their full configuration loaded
+# into the current scope. This provides at least some remedy for CMake issue
+# #20670 -- otherwise variables like TBB_VERSION, etc won't be in the caller's
+# scope.
+macro(_thrust_find_TBB required)
+  if(NOT TARGET Thrust::TBB)
+    thrust_debug("Searching for TBB ${required}" internal)
+    # Swap in a temporary module path to make sure we use our FindTBB.cmake
+    set(_THRUST_STASH_MODULE_PATH "${CMAKE_MODULE_PATH}")
+    set(CMAKE_MODULE_PATH "${_THRUST_CMAKE_DIR}")
+
+    # Push policy CMP0074 to silence warnings about TBB_ROOT being set. This
+    # var is used unconventionally in this FindTBB.cmake module.
+    # Someday we'll have a suitable TBB cmake configuration and can avoid this.
+    cmake_policy(PUSH)
+    cmake_policy(SET CMP0074 OLD)
+    set(THRUST_TBB_ROOT "" CACHE PATH "Path to the root of the TBB installation.")
+    if (TBB_ROOT AND NOT THRUST_TBB_ROOT)
+      message(
+        "Warning: TBB_ROOT is set. "
+        "Thrust uses THRUST_TBB_ROOT to avoid issues with CMake Policy CMP0074. "
+        "Please set this variable instead when using Thrust with TBB."
+      )
+    endif()
+    set(TBB_ROOT "${THRUST_TBB_ROOT}")
+    set(_THRUST_STASH_TBB_ROOT "${TBB_ROOT}")
+
+    find_package(TBB
+      ${_THRUST_QUIET_FLAG}
+      ${required}
+    )
+
+    cmake_policy(POP)
+    set(TBB_ROOT "${_THRUST_STASH_TBB_ROOT}")
+    set(CMAKE_MODULE_PATH "${_THRUST_STASH_MODULE_PATH}")
+
+    if (TARGET TBB::tbb)
+      thrust_set_TBB_target(TBB::tbb)
+    else()
+      thrust_debug("TBB not found!" internal)
+    endif()
+  endif()
+endmacro()
+
+# Wrap the OpenMP flags for CUDA targets
+function(thrust_fixup_omp_target omp_target)
+  get_target_property(opts ${omp_target} INTERFACE_COMPILE_OPTIONS)
+  if (opts MATCHES "\\$<\\$<COMPILE_LANGUAGE:CXX>:([^>]*)>")
+    target_compile_options(${omp_target} INTERFACE
+      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcompiler=${CMAKE_MATCH_1}>
+    )
+  endif()
+endfunction()
+
+# This must be a macro instead of a function to ensure that backends passed to
+# find_package(Thrust COMPONENTS [...]) have their full configuration loaded
+# into the current scope. This provides at least some remedy for CMake issue
+# #20670 -- otherwise variables like OpenMP_CXX_VERSION, etc won't be in the caller's
+# scope.
+macro(_thrust_find_OMP required)
+  if (NOT TARGET Thrust::OMP)
+    thrust_debug("Searching for OMP ${required}" internal)
+    find_package(OpenMP
+      ${_THRUST_QUIET_FLAG}
+      ${_THRUST_REQUIRED_FLAG_OMP}
+      COMPONENTS CXX
+    )
+
+    if (TARGET OpenMP::OpenMP_CXX)
+      thrust_fixup_omp_target(OpenMP::OpenMP_CXX)
+      thrust_set_OMP_target(OpenMP::OpenMP_CXX)
+    else()
+      thrust_debug("OpenMP::OpenMP_CXX not found!" internal)
+    endif()
+  endif()
+endmacro()
+
+# This must be a macro instead of a function to ensure that backends passed to
+# find_package(Thrust COMPONENTS [...]) have their full configuration loaded
+# into the current scope. This provides at least some remedy for CMake issue
+# #20670 -- otherwise variables like CUB_VERSION, etc won't be in the caller's
+# scope.
+macro(_thrust_find_backend backend required)
+  # Unfortunately, _thrust_find_${backend}(req) is not valid CMake syntax. Hence
+  # why this function exists.
+  if ("${backend}" STREQUAL "CPP")
+    _thrust_find_CPP("${required}")
+  elseif ("${backend}" STREQUAL "CUDA")
+    _thrust_find_CUDA("${required}")
+  elseif ("${backend}" STREQUAL "TBB")
+    _thrust_find_TBB("${required}")
+  elseif ("${backend}" STREQUAL "OMP")
+    _thrust_find_OMP("${required}")
+  else()
+    message(FATAL_ERROR "_thrust_find_backend: Invalid system: ${backend}")
+  endif()
+endmacro()
+
+################################################################################
+# Initialization. Executed inside find_package(Thrust) call.
+#
+
+if (${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY)
+  set(_THRUST_QUIET ON CACHE INTERNAL "Quiet mode enabled for Thrust find_package calls." FORCE)
+  set(_THRUST_QUIET_FLAG "QUIET" CACHE INTERNAL "" FORCE)
+else()
+  unset(_THRUST_QUIET CACHE)
+  unset(_THRUST_QUIET_FLAG CACHE)
+endif()
+
+set(_THRUST_CMAKE_DIR "${CMAKE_CURRENT_LIST_DIR}" CACHE INTERNAL
+  "Location of thrust-config.cmake"
+  FORCE
+)
+
+# Internal target that actually holds the Thrust interface. Used by all other Thrust targets.
+if (NOT TARGET Thrust::Thrust)
+  _thrust_declare_interface_alias(Thrust::Thrust _Thrust_Thrust)
+  # Pull in the include dir detected by thrust-config-version.cmake
+  set(_THRUST_INCLUDE_DIR "${_THRUST_VERSION_INCLUDE_DIR}"
+    CACHE INTERNAL "Location of Thrust headers."
+    FORCE
+  )
+  unset(_THRUST_VERSION_INCLUDE_DIR CACHE) # Clear tmp variable from cache
+  target_include_directories(_Thrust_Thrust INTERFACE "${_THRUST_INCLUDE_DIR}")
+  thrust_debug_target(Thrust::Thrust "${THRUST_VERSION}" internal)
+endif()
+
+# Find libcudacxx prior to locating backend-specific deps. This ensures that CUB
+# finds the same package.
+if (NOT TARGET Thrust::libcudacxx)
+  thrust_debug("Searching for libcudacxx REQUIRED" internal)
+
+  # First do a non-required search for any co-packaged versions.
+  # These are preferred.
+  find_package(libcudacxx ${thrust_libcudacxx_version} CONFIG
+    ${_THRUST_QUIET_FLAG}
+    NO_DEFAULT_PATH # Only check the explicit HINTS below:
+    HINTS
+      "${_THRUST_INCLUDE_DIR}/dependencies/libcudacxx" # Source layout (GitHub)
+      "${_THRUST_INCLUDE_DIR}/../libcudacxx"           # Source layout (Perforce)
+      "${_THRUST_CMAKE_DIR}/.."                        # Install layout
+  )
+
+  # A second required search allows externally packaged to be used and fails if
+  # no suitable package exists.
+  find_package(libcudacxx ${thrust_libcudacxx_version} CONFIG
+    REQUIRED
+    ${_THRUST_QUIET_FLAG}
+  )
+
+  if (TARGET libcudacxx::libcudacxx)
+    _thrust_set_libcudacxx_target(libcudacxx::libcudacxx)
+  else()
+    thrust_debug("Expected libcudacxx::libcudacxx target not found!" internal)
+  endif()
+
+  target_link_libraries(_Thrust_Thrust INTERFACE Thrust::libcudacxx)
+endif()
+
+# Handle find_package COMPONENT requests:
+foreach(component ${${CMAKE_FIND_PACKAGE_NAME}_FIND_COMPONENTS})
+  if (NOT component IN_LIST THRUST_HOST_SYSTEM_OPTIONS AND
+      NOT component IN_LIST THRUST_DEVICE_SYSTEM_OPTIONS)
+    message(FATAL_ERROR "Invalid component requested: '${component}'")
+  endif()
+
+  unset(req)
+  if (${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED_${component})
+    set(req "REQUIRED")
+  endif()
+
+  thrust_debug("Preloading COMPONENT '${component}' ${req}" internal)
+  _thrust_find_backend(${component} "${req}")
+endforeach()
+
+thrust_update_system_found_flags()
+
+include(FindPackageHandleStandardArgs)
+if (NOT Thrust_CONFIG)
+  set(Thrust_CONFIG "${CMAKE_CURRENT_LIST_FILE}")
+endif()
+find_package_handle_standard_args(Thrust CONFIG_MODE)
diff --git a/thrust/cmake/thrust-header-search.cmake b/thrust/cmake/thrust-header-search.cmake
new file mode 100644
index 000000000..3d69398a7
--- /dev/null
+++ b/thrust/cmake/thrust-header-search.cmake
@@ -0,0 +1,6 @@
+# Parse version information from version.h in source tree
+set(_THRUST_VERSION_INCLUDE_DIR "${CMAKE_CURRENT_LIST_DIR}/../..")
+if(EXISTS "${_THRUST_VERSION_INCLUDE_DIR}/thrust/version.h")
+  set(_THRUST_VERSION_INCLUDE_DIR "${_THRUST_VERSION_INCLUDE_DIR}" CACHE FILEPATH "" FORCE) # Clear old result
+  set_property(CACHE _THRUST_VERSION_INCLUDE_DIR PROPERTY TYPE INTERNAL)
+endif()
diff --git a/thrust/cmake/thrust-header-search.cmake.in b/thrust/cmake/thrust-header-search.cmake.in
new file mode 100644
index 000000000..c014c469b
--- /dev/null
+++ b/thrust/cmake/thrust-header-search.cmake.in
@@ -0,0 +1,18 @@
+# Parse version information from version.h:
+unset(_THRUST_VERSION_INCLUDE_DIR CACHE) # Clear old result to force search
+
+# Find CMAKE_INSTALL_INCLUDEDIR=@CMAKE_INSTALL_INCLUDEDIR@ directory"
+set(from_install_prefix "@install_location@")
+
+# Transform to a list of directories, replace each directoy with "../"
+# and convert back to a string
+string(REGEX REPLACE "/" ";" from_install_prefix "${from_install_prefix}")
+list(TRANSFORM from_install_prefix REPLACE ".+" "../")
+list(JOIN from_install_prefix "" from_install_prefix)
+
+find_path(_THRUST_VERSION_INCLUDE_DIR thrust/version.h
+  NO_DEFAULT_PATH # Only search explicit paths below:
+  PATHS
+    "${CMAKE_CURRENT_LIST_DIR}/${from_install_prefix}/@CMAKE_INSTALL_INCLUDEDIR@"
+)
+set_property(CACHE _THRUST_VERSION_INCLUDE_DIR PROPERTY TYPE INTERNAL)
diff --git a/thrust/complex.h b/thrust/complex.h
index cd21f2409..8c0be0d61 100644
--- a/thrust/complex.h
+++ b/thrust/complex.h
@@ -44,8 +44,7 @@
 #  define THRUST_STD_COMPLEX_DEVICE
 #endif
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*
  *  Calls to the standard math library from inside the thrust namespace
@@ -63,13 +62,16 @@ namespace thrust
  *  \{
  */
 
+/*! \cond
+ */
+
 namespace detail
 {
-  
+
 template <typename T, std::size_t Align>
 struct complex_storage;
 
-#if __cplusplus >= 201103L                                                    \
+#if THRUST_CPP_DIALECT >= 2011                                                    \
   && (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC)                       \
   && (THRUST_GCC_VERSION >= 40800)
   // C++11 implementation, excluding GCC 4.7, which doesn't have `alignas`.
@@ -82,9 +84,9 @@ struct complex_storage;
     || (   (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC)                 \
         && (THRUST_GCC_VERSION < 40600))
   // C++03 implementation for MSVC and GCC <= 4.5.
-  // 
+  //
   // We have to implement `aligned_type` with specializations for MSVC
-  // and GCC 4.2 and older because they require literals as arguments to 
+  // and GCC 4.2 and older because they require literals as arguments to
   // their alignment attribute.
 
   #if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC)
@@ -115,7 +117,7 @@ struct complex_storage;
   {
     T x; T y;
   };
-  
+
   THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(1);
   THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(2);
   THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(4);
@@ -137,14 +139,17 @@ struct complex_storage;
 
 } // end namespace detail
 
-  /*! \p complex is the Thrust equivalent to <tt>std::complex</tt>. It is
-   *  functionally identical to it, but can also be used in device code which
-   *  <tt>std::complex</tt> currently cannot.
-   *
-   *  \tparam T The type used to hold the real and imaginary parts. Should be
-   *  <tt>float</tt> or <tt>double</tt>. Others types are not supported.
-   *
-   */
+/*! \endcond
+ */
+
+/*! \p complex is the Thrust equivalent to <tt>std::complex</tt>. It is
+ *  functionally identical to it, but can also be used in device code which
+ *  <tt>std::complex</tt> currently cannot.
+ *
+ *  \tparam T The type used to hold the real and imaginary parts. Should be
+ *  <tt>float</tt> or <tt>double</tt>. Others types are not supported.
+ *
+ */
 template <typename T>
 struct complex
 {
@@ -1026,7 +1031,7 @@ template <typename T0, typename T1>
 __host__ __device__
 bool operator!=(const complex<T0>& x, const T1& y);
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/complex/complex.inl>
 
diff --git a/thrust/copy.h b/thrust/copy.h
index 23365875d..99d488174 100644
--- a/thrust/copy.h
+++ b/thrust/copy.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup algorithms
  */
@@ -54,11 +53,11 @@ namespace thrust
  *  \param last The end of the sequence to copy.
  *  \param result The destination sequence.
  *  \return The end of the destination sequence.
- *  \see http://www.sgi.com/tech/stl/copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/copy
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam InputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre \p result may be equal to \p first, but \p result shall not be in the range <tt>[first, last)</tt> otherwise.
  *
@@ -107,9 +106,9 @@ __host__ __device__
  *  \return The end of the destination range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam InputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
  *  \tparam Size is an integral type.
- *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre \p result may be equal to \p first, but \p result shall not be in the range <tt>[first, first + n)</tt> otherwise.
  *
@@ -130,7 +129,7 @@ __host__ __device__
  *  // vec1 is now a copy of vec0
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/copy_n.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/copy_n
  *  \see thrust::copy
  */
 template<typename DerivedPolicy, typename InputIterator, typename Size, typename OutputIterator>
@@ -157,10 +156,10 @@ __host__ __device__
  *  \param last The end of the sequence to copy.
  *  \param result The destination sequence.
  *  \return The end of the destination sequence.
- *  \see http://www.sgi.com/tech/stl/copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/copy
  *
- *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam InputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre \p result may be equal to \p first, but \p result shall not be in the range <tt>[first, last)</tt> otherwise.
  *
@@ -202,9 +201,9 @@ template<typename InputIterator, typename OutputIterator>
  *  \param result The beginning destination range.
  *  \return The end of the destination range.
  *
- *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam InputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
  *  \tparam Size is an integral type.
- *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre \p result may be equal to \p first, but \p result shall not be in the range <tt>[first, first + n)</tt> otherwise.
  *
@@ -224,7 +223,7 @@ template<typename InputIterator, typename OutputIterator>
  *  // vec1 is now a copy of vec0
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/copy_n.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/copy_n
  *  \see thrust::copy
  */
 template<typename InputIterator, typename Size, typename OutputIterator>
@@ -261,10 +260,10 @@ template<typename InputIterator, typename Size, typename OutputIterator>
  *          evaluated to \c true in the range <tt>[first, last)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *                        and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -323,10 +322,10 @@ __host__ __device__
  *  \return <tt>result + n</tt>, where \c n is equal to the number of times \p pred
  *          evaluated to \c true in the range <tt>[first, last)</tt>.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *                        and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -388,11 +387,11 @@ template<typename InputIterator,
  *          evaluated to \c true in the range <tt>[stencil, stencil + (last-first))</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *                         and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/named_req/OutputIterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
  *  \pre The ranges <tt>[stencil, stencil + (last - first))</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
@@ -455,11 +454,11 @@ __host__ __device__
  *  \return <tt>result + n</tt>, where \c n is equal to the number of times \p pred
  *          evaluated to \c true in the range <tt>[stencil, stencil + (last-first))</tt>.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *                         and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/named_req/OutputIterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
  *  \pre The ranges <tt>[stencil, stencil + (last - first))</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
@@ -505,8 +504,8 @@ template<typename InputIterator1,
 
 /*! \} // end stream_compaction
  */
-	
-} // end namespace thrust
+
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/copy.h>
 #include <thrust/detail/copy_if.h>
diff --git a/thrust/count.h b/thrust/count.h
index 9225bc6a7..abf8b2d6c 100644
--- a/thrust/count.h
+++ b/thrust/count.h
@@ -25,9 +25,7 @@
 #include <thrust/detail/execution_policy.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup algorithms
  */
@@ -56,8 +54,8 @@ namespace thrust
  *  \return The number of elements equal to \p value.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be a model of must be a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam EqualityComparable must be a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a> and can be compared for equality with \c InputIterator's \c value_type
+ *  \tparam InputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator's \c value_type must be a model of must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam EqualityComparable must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a> and can be compared for equality with \c InputIterator's \c value_type
  *
  *  The following code snippet demonstrates how to use \p count to 
  *  count the number of instances in a range of a value of interest using the \p thrust::device execution policy:
@@ -78,7 +76,7 @@ namespace thrust
  *  // result == 3
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/count.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/count
  */
 template<typename DerivedPolicy, typename InputIterator, typename EqualityComparable>
 __host__ __device__
@@ -96,8 +94,8 @@ __host__ __device__
  *  \param value The value to be counted.
  *  \return The number of elements equal to \p value.
  *
- *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be a model of must be a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam EqualityComparable must be a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a> and can be compared for equality with \c InputIterator's \c value_type
+ *  \tparam InputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator's \c value_type must be a model of must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam EqualityComparable must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a> and can be compared for equality with \c InputIterator's \c value_type
  *
  *  The following code snippet demonstrates how to use \p count to 
  *  count the number of instances in a range of a value of interest.
@@ -116,7 +114,7 @@ __host__ __device__
  *  // result == 3
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/count.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/count
  */
 template <typename InputIterator, typename EqualityComparable>
   typename thrust::iterator_traits<InputIterator>::difference_type
@@ -136,8 +134,8 @@ template <typename InputIterator, typename EqualityComparable>
  *  \return The number of elements where \p pred is \c true.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c Predicate's \c argument_type.
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c Predicate's \c argument_type.
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p count to
  *  count the number of odd numbers in a range using the \p thrust::device execution policy:
@@ -169,7 +167,7 @@ template <typename InputIterator, typename EqualityComparable>
  *  // result == 2
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/count.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/count
  */
 template<typename DerivedPolicy, typename InputIterator, typename Predicate>
 __host__ __device__
@@ -186,8 +184,8 @@ __host__ __device__
  *  \param pred The predicate.
  *  \return The number of elements where \p pred is \c true.
  *
- *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c Predicate's \c argument_type.
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c Predicate's \c argument_type.
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p count to
  *  count the number of odd numbers in a range.
@@ -217,7 +215,7 @@ __host__ __device__
  *  // result == 2
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/count.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/count
  */
 template <typename InputIterator, typename Predicate>
   typename thrust::iterator_traits<InputIterator>::difference_type
@@ -228,8 +226,6 @@ template <typename InputIterator, typename Predicate>
  *  \} // end reductions
  */
 
+THRUST_NAMESPACE_END
 
-} // end thrust
-
-#include <thrust/detail/count.inl>
-
+#include <thrust/detail/count.h>
diff --git a/thrust/detail/adjacent_difference.inl b/thrust/detail/adjacent_difference.inl
index f8099450f..844687cff 100644
--- a/thrust/detail/adjacent_difference.inl
+++ b/thrust/detail/adjacent_difference.inl
@@ -14,25 +14,20 @@
  *  limitations under the License.
  */
 
-
-/*! \file adjacent_difference.inl
- *  \brief Inline file for adjacent_difference.h
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/select_system.h>
 #include <thrust/system/detail/generic/adjacent_difference.h>
 #include <thrust/system/detail/adl/adjacent_difference.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename InputIterator, typename OutputIterator>
 __host__ __device__
 OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                                   InputIterator first, InputIterator last, 
+                                   InputIterator first, InputIterator last,
                                    OutputIterator result)
 {
   using thrust::system::detail::generic::adjacent_difference;
@@ -41,11 +36,11 @@ OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<D
 } // end adjacent_difference()
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename BinaryFunction>
 __host__ __device__
 OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                                   InputIterator first, InputIterator last, 
+                                   InputIterator first, InputIterator last,
                                    OutputIterator result,
                                    BinaryFunction binary_op)
 {
@@ -56,7 +51,7 @@ OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<D
 
 
 template <typename InputIterator, typename OutputIterator>
-OutputIterator adjacent_difference(InputIterator first, InputIterator last, 
+OutputIterator adjacent_difference(InputIterator first, InputIterator last,
                                    OutputIterator result)
 {
   using thrust::system::detail::generic::select_system;
@@ -88,5 +83,4 @@ OutputIterator adjacent_difference(InputIterator first, InputIterator last,
 } // end adjacent_difference()
 
 
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/advance.inl b/thrust/detail/advance.inl
index 2694a7ec6..7b5f261bd 100644
--- a/thrust/detail/advance.inl
+++ b/thrust/detail/advance.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file advance.inl
- *  \brief Inline file for advance.h
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/advance.h>
@@ -27,8 +24,7 @@
 #include <thrust/detail/type_traits/has_nested_type.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 __THRUST_DEFINE_HAS_NESTED_TYPE(has_difference_type, difference_type)
 
@@ -75,5 +71,4 @@ typename detail::disable_if<
   return i;
 }
 
-} // namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/algorithm_wrapper.h b/thrust/detail/algorithm_wrapper.h
new file mode 100644
index 000000000..c09b9a0a0
--- /dev/null
+++ b/thrust/detail/algorithm_wrapper.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright 2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+// When a compiler uses Thrust as part of its implementation of Standard C++
+// algorithms, a cycle of included files may result when Thrust code tries to
+// use a standard algorithm.  Having a macro that is defined only when Thrust
+// is including an algorithms-related header gives the compiler a chance to
+// detect and break the cycle of includes.
+
+#define THRUST_INCLUDING_ALGORITHMS_HEADER
+#include <algorithm>
+#undef  THRUST_INCLUDING_ALGORITHMS_HEADER
diff --git a/thrust/detail/alignment.h b/thrust/detail/alignment.h
index c787b0a13..08f73501e 100644
--- a/thrust/detail/alignment.h
+++ b/thrust/detail/alignment.h
@@ -25,12 +25,12 @@
 
 #include <cstddef> // For `std::size_t` and `std::max_align_t`.
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
     #include <type_traits> // For `std::alignment_of` and `std::aligned_storage`.
 #endif
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
+
 namespace detail
 {
 
@@ -43,7 +43,7 @@ namespace detail
 /// inside of a `__declspec(align(#))` attribute. As a workaround, you can
 /// assign the result of \p THRUST_ALIGNOF to a variable and pass the variable
 /// as the argument to `__declspec(align(#))`.
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
     #define THRUST_ALIGNOF(x) alignof(x) 
 #else
     #define THRUST_ALIGNOF(x) __alignof(x)
@@ -54,7 +54,7 @@ namespace detail
 /// expression.
 /// 
 /// It is an implementation of C++11's \p std::alignment_of.
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
     template <typename T>
     using alignment_of = std::alignment_of<T>;
 #else
@@ -97,7 +97,7 @@ namespace detail
 template <std::size_t Align>
 struct aligned_type;
 
-#if __cplusplus >= 201103L                                                     \
+#if THRUST_CPP_DIALECT >= 2011                                                     \
   && (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC)                        \
   && (THRUST_GCC_VERSION >= 40800)
     // C++11 implementation, excluding GCC 4.7, which doesn't have `alignas`.
@@ -161,7 +161,7 @@ struct aligned_type;
 /// The behavior is undefined if `Len` is 0 or `Align` is not a power of 2.
 ///
 /// It is an implementation of C++11's \p std::aligned_storage.
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
     template <std::size_t Len, std::size_t Align>
     using aligned_storage = std::aligned_storage<Len, Align>;
 #else
@@ -184,7 +184,7 @@ struct aligned_type;
 /// strict (as large) as that of every scalar type.
 ///
 /// It is an implementation of C++11's \p std::max_align_t.
-#if __cplusplus >= 201103L                                                     \
+#if THRUST_CPP_DIALECT >= 2011                                                     \
   && (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC)                        \
   && (THRUST_GCC_VERSION >= 40900)
     // GCC 4.7 and 4.8 don't have `std::max_align_t`.
@@ -226,5 +226,5 @@ inline std::size_t aligned_storage_size(std::size_t n, std::size_t align)
 }
 
 } // end namespace detail
-} // end namespace thrust
 
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/allocator/allocator_traits.h b/thrust/detail/allocator/allocator_traits.h
index 36f56b8c8..3a5af3661 100644
--- a/thrust/detail/allocator/allocator_traits.h
+++ b/thrust/detail/allocator/allocator_traits.h
@@ -26,8 +26,9 @@
 #include <thrust/detail/type_traits/has_member_function.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+#include <thrust/detail/memory_wrapper.h>
+
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -71,6 +72,25 @@ template<typename Alloc, typename U>
   typedef thrust::detail::integral_constant<bool, value> type;
 };
 
+// The following fields of std::allocator have been deprecated (since C++17).
+// There's no way to detect it other than explicit specialization.
+#if THRUST_CPP_DIALECT >= 2017
+#define THRUST_SPECIALIZE_DEPRECATED(trait_name)                               \
+template <typename T>                                                          \
+struct trait_name<std::allocator<T>> : false_type {};
+
+THRUST_SPECIALIZE_DEPRECATED(has_is_always_equal)
+THRUST_SPECIALIZE_DEPRECATED(has_pointer)
+THRUST_SPECIALIZE_DEPRECATED(has_const_pointer)
+THRUST_SPECIALIZE_DEPRECATED(has_reference)
+THRUST_SPECIALIZE_DEPRECATED(has_const_reference)
+
+#undef THRUST_SPECIALIZE_DEPRECATED
+
+template<typename T, typename U>
+struct has_rebind<std::allocator<T>, U> : false_type {};
+#endif
+
 template<typename T>
   struct nested_pointer
 {
@@ -164,7 +184,7 @@ template<class Alloc, class U, bool = has_rebind<Alloc, U>::value>
     typedef typename Alloc::template rebind<U>::other type;
 };
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
 template<template<typename, typename...> class Alloc,
          typename T, typename... Args, typename U>
   struct rebind_alloc<Alloc<T, Args...>, U, true>
@@ -347,6 +367,10 @@ template<typename Alloc>
   };
 #endif
 
+  // Deprecated std::allocator typedefs that we need:
+  typedef typename thrust::detail::pointer_traits<pointer>::reference reference;
+  typedef typename thrust::detail::pointer_traits<const_pointer>::reference const_reference;
+
   inline __host__ __device__
   static pointer allocate(allocator_type &a, size_type n);
 
@@ -412,7 +436,7 @@ template<typename Alloc>
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/allocator/allocator_traits.inl>
 
diff --git a/thrust/detail/allocator/allocator_traits.inl b/thrust/detail/allocator/allocator_traits.inl
index 1b3da43d9..275330094 100644
--- a/thrust/detail/allocator/allocator_traits.inl
+++ b/thrust/detail/allocator/allocator_traits.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/allocator/allocator_traits.h>
 #include <thrust/detail/type_traits/is_call_possible.h>
@@ -23,12 +25,110 @@
   #include <thrust/detail/type_deduction.h>
 #endif
 
+#include <thrust/detail/memory_wrapper.h>
 #include <new>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
+
+#if THRUST_CPP_DIALECT >= 2011
+
+// std::allocator's member functions are deprecated in C++17 and removed in
+// C++20, so we can't just use the generic implementation for allocator_traits
+// that calls the allocator's member functions.
+// Instead, specialize allocator_traits for std::allocator and defer to
+// std::allocator_traits<std::allocator> and let the STL do whatever it needs
+// to for the current c++ version. Manually forward the calls to suppress
+// host/device warnings.
+template <typename T>
+struct allocator_traits<std::allocator<T>>
+  : public std::allocator_traits<std::allocator<T>>
+{
+private:
+  using superclass = std::allocator_traits<std::allocator<T>>;
+
+public:
+  using allocator_type = typename superclass::allocator_type;
+  using value_type = typename superclass::value_type;
+  using pointer = typename superclass::pointer;
+  using const_pointer = typename superclass::const_pointer;
+  using void_pointer = typename superclass::void_pointer;
+  using const_void_pointer = typename superclass::const_void_pointer;
+  using difference_type = typename superclass::difference_type;
+  using size_type = typename superclass::size_type;
+  using propagate_on_container_swap = typename superclass::propagate_on_container_swap;
+  using propagate_on_container_copy_assignment =
+    typename superclass::propagate_on_container_copy_assignment;
+  using propagate_on_container_move_assignment =
+    typename superclass::propagate_on_container_move_assignment;
+
+  // std::allocator_traits added this in C++17, but thrust::allocator_traits defines
+  // it unconditionally.
+  using is_always_equal = typename eval_if<
+      allocator_traits_detail::has_is_always_equal<allocator_type>::value,
+      allocator_traits_detail::nested_is_always_equal<allocator_type>,
+      is_empty<allocator_type>
+    >::type;
+
+  // std::allocator_traits doesn't provide these, but
+  // thrust::detail::allocator_traits does. These used to be part of the
+  // std::allocator API but were deprecated in C++17.
+  using reference = typename thrust::detail::pointer_traits<pointer>::reference;
+  using const_reference = typename thrust::detail::pointer_traits<const_pointer>::reference;
+
+  template <typename U>
+  using rebind_alloc = std::allocator<U>;
+  template <typename U>
+  using rebind_traits = allocator_traits<std::allocator<U>>;
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  static pointer allocate(allocator_type &a, size_type n)
+  {
+    return superclass::allocate(a, n);
+  }
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  static pointer allocate(allocator_type &a, size_type n, const_void_pointer hint)
+  {
+    return superclass::allocate(a, n, hint);
+  }
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  static void deallocate(allocator_type &a, pointer p, size_type n)
+  {
+    superclass::deallocate(a, p, n);
+  }
+
+  __thrust_exec_check_disable__
+  template <typename U, typename ...Args>
+  __host__ __device__
+  static void construct(allocator_type &a, U *p, Args&&... args)
+  {
+    superclass::construct(a, p, THRUST_FWD(args)...);
+  }
+
+  __thrust_exec_check_disable__
+  template <typename U>
+  __host__ __device__
+  static void destroy(allocator_type &a, U *p)
+  {
+    superclass::destroy(a, p);
+  }
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  static size_type max_size(const allocator_type &a)
+  {
+    return superclass::max_size(a);
+  }
+};
+
+#endif //  C++11
+
 namespace allocator_traits_detail
 {
 
@@ -87,6 +187,7 @@ template<typename Alloc, typename T>
   a.construct(p);
 }
 
+__thrust_exec_check_disable__
 template<typename Alloc, typename T>
   inline __host__ __device__
     typename disable_if<
@@ -245,9 +346,8 @@ __host__ __device__
   >::type
     system(Alloc &)
 {
-  // return a copy of a default-constructed system
-  typename allocator_system<Alloc>::type result;
-  return result;
+  // return a copy of a value-initialized system
+  return typename allocator_system<Alloc>::type();
 }
 
 
@@ -263,7 +363,7 @@ __host__ __device__
   struct workaround_warnings
   {
     __thrust_exec_check_disable__
-    static __host__ __device__ 
+    static __host__ __device__
     typename allocator_traits<Alloc>::pointer
       allocate(Alloc &a, typename allocator_traits<Alloc>::size_type n)
     {
@@ -361,5 +461,5 @@ __host__ __device__
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/allocator/copy_construct_range.h b/thrust/detail/allocator/copy_construct_range.h
index 491c8ef41..b3c2de324 100644
--- a/thrust/detail/allocator/copy_construct_range.h
+++ b/thrust/detail/allocator/copy_construct_range.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -41,7 +40,7 @@ __host__ __device__
                                  Pointer result);
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/allocator/copy_construct_range.inl>
 
diff --git a/thrust/detail/allocator/copy_construct_range.inl b/thrust/detail/allocator/copy_construct_range.inl
index 4bc7f5dfb..a71cca1f7 100644
--- a/thrust/detail/allocator/copy_construct_range.inl
+++ b/thrust/detail/allocator/copy_construct_range.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/allocator/allocator_traits.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
@@ -24,10 +26,9 @@
 #include <thrust/distance.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/for_each.h>
-#include <memory>
+#include <thrust/detail/memory_wrapper.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace allocator_traits_detail
@@ -305,5 +306,5 @@ __host__ __device__
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/allocator/default_construct_range.h b/thrust/detail/allocator/default_construct_range.h
index 6c3856c14..8b5026c05 100644
--- a/thrust/detail/allocator/default_construct_range.h
+++ b/thrust/detail/allocator/default_construct_range.h
@@ -18,8 +18,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -30,7 +29,7 @@ inline void default_construct_range(Allocator &a, Pointer p, Size n);
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/allocator/default_construct_range.inl>
 
diff --git a/thrust/detail/allocator/default_construct_range.inl b/thrust/detail/allocator/default_construct_range.inl
index 0f65d4806..6d26578fa 100644
--- a/thrust/detail/allocator/default_construct_range.inl
+++ b/thrust/detail/allocator/default_construct_range.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/allocator/allocator_traits.h>
 #include <thrust/detail/type_traits.h>
@@ -21,8 +23,7 @@
 #include <thrust/for_each.h>
 #include <thrust/uninitialized_fill.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace allocator_traits_detail
@@ -58,7 +59,7 @@ template<typename Allocator, typename T>
 {};
 
 
-// we know that std::allocator::construct's only effect is to call T's 
+// we know that std::allocator::construct's only effect is to call T's
 // default constructor, so we needn't use it for default construction
 // unless T's constructor does something interesting
 template<typename U, typename T>
@@ -107,5 +108,5 @@ __host__ __device__
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/allocator/destroy_range.h b/thrust/detail/allocator/destroy_range.h
index bf00037ce..cfc7e3f6e 100644
--- a/thrust/detail/allocator/destroy_range.h
+++ b/thrust/detail/allocator/destroy_range.h
@@ -18,8 +18,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -28,7 +27,7 @@ __host__ __device__
   inline void destroy_range(Allocator &a, Pointer p, Size n);
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/allocator/destroy_range.inl>
 
diff --git a/thrust/detail/allocator/destroy_range.inl b/thrust/detail/allocator/destroy_range.inl
index d64745766..662177f3a 100644
--- a/thrust/detail/allocator/destroy_range.inl
+++ b/thrust/detail/allocator/destroy_range.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,14 +14,17 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/allocator/destroy_range.h>
 #include <thrust/detail/allocator/allocator_traits.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
 #include <thrust/for_each.h>
-#include <memory>
+#include <thrust/detail/memory_wrapper.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace allocator_traits_detail
@@ -160,5 +163,5 @@ __host__ __device__
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/allocator/fill_construct_range.h b/thrust/detail/allocator/fill_construct_range.h
index 9de0f7bcb..a7572cb2d 100644
--- a/thrust/detail/allocator/fill_construct_range.h
+++ b/thrust/detail/allocator/fill_construct_range.h
@@ -18,8 +18,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -30,7 +29,7 @@ inline void fill_construct_range(Allocator &a, Pointer p, Size n, const T &value
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/allocator/fill_construct_range.inl>
 
diff --git a/thrust/detail/allocator/fill_construct_range.inl b/thrust/detail/allocator/fill_construct_range.inl
index 2f966703f..876b5ddd2 100644
--- a/thrust/detail/allocator/fill_construct_range.inl
+++ b/thrust/detail/allocator/fill_construct_range.inl
@@ -14,16 +14,17 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/allocator/allocator_traits.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
 #include <thrust/for_each.h>
 #include <thrust/uninitialized_fill.h>
-#include <memory>
+#include <thrust/detail/memory_wrapper.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace allocator_traits_detail
@@ -109,5 +110,5 @@ __host__ __device__
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/allocator/malloc_allocator.h b/thrust/detail/allocator/malloc_allocator.h
index 2c01c66bd..af3d0fccb 100644
--- a/thrust/detail/allocator/malloc_allocator.h
+++ b/thrust/detail/allocator/malloc_allocator.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/type_traits/pointer_traits.h>
 #include <thrust/detail/allocator/tagged_allocator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -46,7 +45,7 @@ template<typename T, typename System, typename Pointer>
 };
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/allocator/malloc_allocator.inl>
 
diff --git a/thrust/detail/allocator/malloc_allocator.inl b/thrust/detail/allocator/malloc_allocator.inl
index e7b7503ba..d03d33305 100644
--- a/thrust/detail/allocator/malloc_allocator.inl
+++ b/thrust/detail/allocator/malloc_allocator.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/allocator/malloc_allocator.h>
 #include <thrust/system/detail/generic/select_system.h>
@@ -21,8 +23,7 @@
 #include <thrust/detail/raw_pointer_cast.h>
 #include <thrust/detail/malloc_and_free.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -60,5 +61,5 @@ template<typename T, typename System, typename Pointer>
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/allocator/no_throw_allocator.h b/thrust/detail/allocator/no_throw_allocator.h
index ba8c3d852..a6c16985b 100644
--- a/thrust/detail/allocator/no_throw_allocator.h
+++ b/thrust/detail/allocator/no_throw_allocator.h
@@ -18,8 +18,9 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+#include <nv/target>
+
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -44,18 +45,18 @@ template<typename BaseAllocator>
     __host__ __device__
     void deallocate(typename super_t::pointer p, typename super_t::size_type n)
     {
-#ifndef __CUDA_ARCH__
-      try
-      {
+      NV_IF_TARGET(NV_IS_HOST, (
+        try
+        {
+          super_t::deallocate(p, n);
+        } // end try
+        catch(...)
+        {
+          // catch anything
+        } // end catch
+      ), (
         super_t::deallocate(p, n);
-      } // end try
-      catch(...)
-      {
-        // catch anything
-      } // end catch
-#else
-      super_t::deallocate(p, n);
-#endif
+      ));
     } // end deallocate()
 
     inline __host__ __device__
@@ -66,6 +67,6 @@ template<typename BaseAllocator>
 }; // end no_throw_allocator
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 
diff --git a/thrust/detail/allocator/tagged_allocator.h b/thrust/detail/allocator/tagged_allocator.h
index a29115c6c..804c4e42e 100644
--- a/thrust/detail/allocator/tagged_allocator.h
+++ b/thrust/detail/allocator/tagged_allocator.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/type_traits/pointer_traits.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -95,7 +94,7 @@ __host__ __device__
 bool operator!=(const tagged_allocator<T1,Pointer1,Tag> &, const tagged_allocator<T2,Pointer2,Tag> &);
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/allocator/tagged_allocator.inl>
 
diff --git a/thrust/detail/allocator/tagged_allocator.inl b/thrust/detail/allocator/tagged_allocator.inl
index 5f4ed9596..bcd534cbc 100644
--- a/thrust/detail/allocator/tagged_allocator.inl
+++ b/thrust/detail/allocator/tagged_allocator.inl
@@ -14,12 +14,13 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/allocator/tagged_allocator.h>
 #include <limits>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -96,8 +97,8 @@ bool operator!=(const tagged_allocator<T1,Pointer1,Tag> &, const tagged_allocato
 {
   return false;
 }
-    
+
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/allocator/temporary_allocator.h b/thrust/detail/allocator/temporary_allocator.h
index 4d2ac429c..c8ef60625 100644
--- a/thrust/detail/allocator/temporary_allocator.h
+++ b/thrust/detail/allocator/temporary_allocator.h
@@ -23,8 +23,7 @@
 #include <thrust/memory.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -79,7 +78,7 @@ template<typename T, typename System>
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/allocator/temporary_allocator.inl>
 
diff --git a/thrust/detail/allocator/temporary_allocator.inl b/thrust/detail/allocator/temporary_allocator.inl
index d66d1290e..ef5d1afa5 100644
--- a/thrust/detail/allocator/temporary_allocator.inl
+++ b/thrust/detail/allocator/temporary_allocator.inl
@@ -14,18 +14,23 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/allocator/temporary_allocator.h>
 #include <thrust/detail/temporary_buffer.h>
 #include <thrust/system/detail/bad_alloc.h>
 #include <cassert>
 
-#ifdef __CUDACC__
+#include <nv/target>
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+#if (defined(_NVHPC_CUDA) || defined(__CUDA_ARCH__))
 #include <thrust/system/cuda/detail/terminate.h>
-#endif
+#endif // NVCC device pass or NVC++
+#endif // CUDA
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -45,10 +50,14 @@ __host__ __device__
     // note that we pass cnt to deallocate, not a value derived from result.second
     deallocate(result.first, cnt);
 
-#if !defined(__CUDA_ARCH__)
-    throw thrust::system::detail::bad_alloc("temporary_buffer::allocate: get_temporary_buffer failed");
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+    NV_IF_TARGET(NV_IS_HOST, (
+      throw thrust::system::detail::bad_alloc("temporary_buffer::allocate: get_temporary_buffer failed");
+    ), ( // NV_IS_DEVICE
+      thrust::system::cuda::detail::terminate_with_message("temporary_buffer::allocate: get_temporary_buffer failed");
+    ));
 #else
-    thrust::system::cuda::detail::terminate_with_message("temporary_buffer::allocate: get_temporary_buffer failed");
+    throw thrust::system::detail::bad_alloc("temporary_buffer::allocate: get_temporary_buffer failed");
 #endif
   } // end if
 
@@ -59,12 +68,12 @@ __host__ __device__
 template<typename T, typename System>
 __host__ __device__
   void temporary_allocator<T,System>
-    ::deallocate(typename temporary_allocator<T,System>::pointer p, typename temporary_allocator<T,System>::size_type)
+    ::deallocate(typename temporary_allocator<T,System>::pointer p, typename temporary_allocator<T,System>::size_type n)
 {
-  return thrust::return_temporary_buffer(system(), p);
+  return thrust::return_temporary_buffer(system(), p, n);
 } // end temporary_allocator
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/allocator_aware_execution_policy.h b/thrust/detail/allocator_aware_execution_policy.h
index 3a6eb071b..eea93c035 100644
--- a/thrust/detail/allocator_aware_execution_policy.h
+++ b/thrust/detail/allocator_aware_execution_policy.h
@@ -24,8 +24,7 @@
   #include <type_traits>
 #endif
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace mr
 {
@@ -83,7 +82,7 @@ struct allocator_aware_execution_policy
     return typename execute_with_allocator_type<Allocator>::type(alloc);
   }
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
   // just the rvalue overload
   // perfect forwarding doesn't help, because a const reference has to be turned
   // into a value by copying for the purpose of storing it in execute_with_allocator
@@ -97,5 +96,6 @@ struct allocator_aware_execution_policy
 #endif
 };
 
-}
-}
+} // end namespace detail
+
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/binary_search.inl b/thrust/detail/binary_search.inl
index 5703226dc..90350ced4 100644
--- a/thrust/detail/binary_search.inl
+++ b/thrust/detail/binary_search.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file binary_search.inl
- *  \brief Inline file for binary_search.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/binary_search.h>
@@ -26,11 +23,9 @@
 #include <thrust/system/detail/generic/binary_search.h>
 #include <thrust/system/detail/adl/binary_search.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename LessThanComparable>
 __host__ __device__
 ForwardIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
@@ -43,7 +38,7 @@ ForwardIterator lower_bound(const thrust::detail::execution_policy_base<DerivedP
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template<typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
 __host__ __device__
 ForwardIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
@@ -57,7 +52,7 @@ ForwardIterator lower_bound(const thrust::detail::execution_policy_base<DerivedP
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template<typename DerivedPolicy, typename ForwardIterator, typename LessThanComparable>
 __host__ __device__
 ForwardIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
@@ -70,7 +65,7 @@ ForwardIterator upper_bound(const thrust::detail::execution_policy_base<DerivedP
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template<typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
 __host__ __device__
 ForwardIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
@@ -84,11 +79,11 @@ ForwardIterator upper_bound(const thrust::detail::execution_policy_base<DerivedP
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename LessThanComparable>
 __host__ __device__
 bool binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                   ForwardIterator first, 
+                   ForwardIterator first,
                    ForwardIterator last,
                    const LessThanComparable& value)
 {
@@ -97,13 +92,13 @@ bool binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &e
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
 __host__ __device__
 bool binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
                    ForwardIterator first,
                    ForwardIterator last,
-                   const T& value, 
+                   const T& value,
                    StrictWeakOrdering comp)
 {
     using thrust::system::detail::generic::binary_search;
@@ -111,7 +106,7 @@ bool binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &e
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
 __host__ __device__
 thrust::pair<ForwardIterator, ForwardIterator>
@@ -126,7 +121,7 @@ equal_range(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename LessThanComparable>
 __host__ __device__
 thrust::pair<ForwardIterator, ForwardIterator>
@@ -140,13 +135,13 @@ equal_range(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
 __host__ __device__
 OutputIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                           ForwardIterator first, 
+                           ForwardIterator first,
                            ForwardIterator last,
-                           InputIterator values_first, 
+                           InputIterator values_first,
                            InputIterator values_last,
                            OutputIterator output)
 {
@@ -155,13 +150,13 @@ OutputIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPo
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
 __host__ __device__
 OutputIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                           ForwardIterator first, 
+                           ForwardIterator first,
                            ForwardIterator last,
-                           InputIterator values_first, 
+                           InputIterator values_first,
                            InputIterator values_last,
                            OutputIterator output,
                            StrictWeakOrdering comp)
@@ -171,13 +166,13 @@ OutputIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPo
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
 __host__ __device__
 OutputIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                           ForwardIterator first, 
+                           ForwardIterator first,
                            ForwardIterator last,
-                           InputIterator values_first, 
+                           InputIterator values_first,
                            InputIterator values_last,
                            OutputIterator output)
 {
@@ -186,13 +181,13 @@ OutputIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPo
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
 __host__ __device__
 OutputIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                           ForwardIterator first, 
+                           ForwardIterator first,
                            ForwardIterator last,
-                           InputIterator values_first, 
+                           InputIterator values_first,
                            InputIterator values_last,
                            OutputIterator output,
                            StrictWeakOrdering comp)
@@ -202,13 +197,13 @@ OutputIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPo
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
 __host__ __device__
 OutputIterator binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                             ForwardIterator first, 
+                             ForwardIterator first,
                              ForwardIterator last,
-                             InputIterator values_first, 
+                             InputIterator values_first,
                              InputIterator values_last,
                              OutputIterator output)
 {
@@ -217,13 +212,13 @@ OutputIterator binary_search(const thrust::detail::execution_policy_base<Derived
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
 __host__ __device__
 OutputIterator binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                             ForwardIterator first, 
+                             ForwardIterator first,
                              ForwardIterator last,
-                             InputIterator values_first, 
+                             InputIterator values_first,
                              InputIterator values_last,
                              OutputIterator output,
                              StrictWeakOrdering comp)
@@ -238,13 +233,13 @@ OutputIterator binary_search(const thrust::detail::execution_policy_base<Derived
 //////////////////////
 
 template <typename ForwardIterator, typename LessThanComparable>
-ForwardIterator lower_bound(ForwardIterator first, 
+ForwardIterator lower_bound(ForwardIterator first,
                             ForwardIterator last,
                             const LessThanComparable& value)
 {
     using thrust::system::detail::generic::select_system;
 
-    typedef typename thrust::iterator_system<ForwardIterator>::type System; 
+    typedef typename thrust::iterator_system<ForwardIterator>::type System;
 
     System system;
 
@@ -254,12 +249,12 @@ ForwardIterator lower_bound(ForwardIterator first,
 template <typename ForwardIterator, typename T, typename StrictWeakOrdering>
 ForwardIterator lower_bound(ForwardIterator first,
                             ForwardIterator last,
-                            const T& value, 
+                            const T& value,
                             StrictWeakOrdering comp)
 {
     using thrust::system::detail::generic::select_system;
 
-    typedef typename thrust::iterator_system<ForwardIterator>::type System; 
+    typedef typename thrust::iterator_system<ForwardIterator>::type System;
 
     System system;
 
@@ -267,7 +262,7 @@ ForwardIterator lower_bound(ForwardIterator first,
 }
 
 template <typename ForwardIterator, typename LessThanComparable>
-ForwardIterator upper_bound(ForwardIterator first, 
+ForwardIterator upper_bound(ForwardIterator first,
                             ForwardIterator last,
                             const LessThanComparable& value)
 {
@@ -283,7 +278,7 @@ ForwardIterator upper_bound(ForwardIterator first,
 template <typename ForwardIterator, typename T, typename StrictWeakOrdering>
 ForwardIterator upper_bound(ForwardIterator first,
                             ForwardIterator last,
-                            const T& value, 
+                            const T& value,
                             StrictWeakOrdering comp)
 {
     using thrust::system::detail::generic::select_system;
@@ -296,7 +291,7 @@ ForwardIterator upper_bound(ForwardIterator first,
 }
 
 template <typename ForwardIterator, typename LessThanComparable>
-bool binary_search(ForwardIterator first, 
+bool binary_search(ForwardIterator first,
                    ForwardIterator last,
                    const LessThanComparable& value)
 {
@@ -312,7 +307,7 @@ bool binary_search(ForwardIterator first,
 template <typename ForwardIterator, typename T, typename StrictWeakOrdering>
 bool binary_search(ForwardIterator first,
                    ForwardIterator last,
-                   const T& value, 
+                   const T& value,
                    StrictWeakOrdering comp)
 {
     using thrust::system::detail::generic::select_system;
@@ -360,9 +355,9 @@ equal_range(ForwardIterator first,
 //////////////////////
 
 template <typename ForwardIterator, typename InputIterator, typename OutputIterator>
-OutputIterator lower_bound(ForwardIterator first, 
+OutputIterator lower_bound(ForwardIterator first,
                            ForwardIterator last,
-                           InputIterator values_first, 
+                           InputIterator values_first,
                            InputIterator values_last,
                            OutputIterator output)
 {
@@ -380,9 +375,9 @@ OutputIterator lower_bound(ForwardIterator first,
 }
 
 template <typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
-OutputIterator lower_bound(ForwardIterator first, 
+OutputIterator lower_bound(ForwardIterator first,
                            ForwardIterator last,
-                           InputIterator values_first, 
+                           InputIterator values_first,
                            InputIterator values_last,
                            OutputIterator output,
                            StrictWeakOrdering comp)
@@ -399,11 +394,11 @@ OutputIterator lower_bound(ForwardIterator first,
 
     return thrust::lower_bound(select_system(system1,system2,system3), first, last, values_first, values_last, output, comp);
 }
-    
+
 template <typename ForwardIterator, typename InputIterator, typename OutputIterator>
-OutputIterator upper_bound(ForwardIterator first, 
+OutputIterator upper_bound(ForwardIterator first,
                            ForwardIterator last,
-                           InputIterator values_first, 
+                           InputIterator values_first,
                            InputIterator values_last,
                            OutputIterator output)
 {
@@ -421,9 +416,9 @@ OutputIterator upper_bound(ForwardIterator first,
 }
 
 template <typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
-OutputIterator upper_bound(ForwardIterator first, 
+OutputIterator upper_bound(ForwardIterator first,
                            ForwardIterator last,
-                           InputIterator values_first, 
+                           InputIterator values_first,
                            InputIterator values_last,
                            OutputIterator output,
                            StrictWeakOrdering comp)
@@ -442,9 +437,9 @@ OutputIterator upper_bound(ForwardIterator first,
 }
 
 template <typename ForwardIterator, typename InputIterator, typename OutputIterator>
-OutputIterator binary_search(ForwardIterator first, 
+OutputIterator binary_search(ForwardIterator first,
                              ForwardIterator last,
-                             InputIterator values_first, 
+                             InputIterator values_first,
                              InputIterator values_last,
                              OutputIterator output)
 {
@@ -462,9 +457,9 @@ OutputIterator binary_search(ForwardIterator first,
 }
 
 template <typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
-OutputIterator binary_search(ForwardIterator first, 
+OutputIterator binary_search(ForwardIterator first,
                              ForwardIterator last,
-                             InputIterator values_first, 
+                             InputIterator values_first,
                              InputIterator values_last,
                              OutputIterator output,
                              StrictWeakOrdering comp)
@@ -482,5 +477,4 @@ OutputIterator binary_search(ForwardIterator first,
     return thrust::binary_search(select_system(system1,system2,system3), first, last, values_first, values_last, output, comp);
 }
 
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/caching_allocator.h b/thrust/detail/caching_allocator.h
new file mode 100644
index 000000000..941f52755
--- /dev/null
+++ b/thrust/detail/caching_allocator.h
@@ -0,0 +1,47 @@
+/*
+ *  Copyright 2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/mr/allocator.h>
+#include <thrust/mr/disjoint_tls_pool.h>
+#include <thrust/mr/new.h>
+#include <thrust/mr/device_memory_resource.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace detail
+{
+inline
+thrust::mr::allocator<
+    char,
+    thrust::mr::disjoint_unsynchronized_pool_resource<
+        thrust::device_memory_resource,
+        thrust::mr::new_delete_resource
+    >
+> single_device_tls_caching_allocator()
+{
+    return {
+        &thrust::mr::tls_disjoint_pool(
+            thrust::mr::get_global_resource<thrust::device_memory_resource>(),
+            thrust::mr::get_global_resource<thrust::mr::new_delete_resource>()
+        )
+    };
+}
+}
+
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/arithmetic.h b/thrust/detail/complex/arithmetic.h
index 448166e98..518f18450 100644
--- a/thrust/detail/complex/arithmetic.h
+++ b/thrust/detail/complex/arithmetic.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *  Copyright 2013 Filipe RNC Maia
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,13 +15,16 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
+#include <thrust/detail/complex/c99math.h>
 #include <cfloat>
 #include <cmath>
-#include <thrust/detail/complex/c99math.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
   /* --- Binary Arithmetic Operators --- */
 
@@ -160,14 +163,14 @@ operator/(const T0& x, const complex<T1>& y)
 
 /* --- Unary Arithmetic Operators --- */
 
-template <typename T> 
+template <typename T>
 __host__ __device__
 complex<T> operator+(const complex<T>& y)
 {
   return y;
 }
 
-template <typename T> 
+template <typename T>
 __host__ __device__
 complex<T> operator-(const complex<T>& y)
 {
@@ -187,7 +190,7 @@ T abs(const complex<T>& z)
 
 // XXX Why are we specializing here?
 namespace detail {
-namespace complex {	
+namespace complex {
 
 __host__ __device__
 inline float abs(const thrust::complex<float>& z)
@@ -258,7 +261,7 @@ inline float norm(const complex<float>& z)
     float a = z.real() * 4.0f;
     float b = z.imag() * 4.0f;
     return (a * a + b * b) / 16.0f;
-  } 
+  }
 
   return z.real() * z.real() + z.imag() * z.imag();
 }
@@ -276,7 +279,7 @@ inline double norm(const complex<double>& z)
     double a = z.real() * 4.0;
     double b = z.imag() * 4.0;
     return (a * a + b * b) / 16.0;
-  } 
+  }
 
   return z.real() * z.real() + z.imag() * z.imag();
 }
@@ -286,7 +289,7 @@ template <typename T0, typename T1>
 __host__ __device__
 complex<typename detail::promoted_numerical_type<T0, T1>::type>
 polar(const T0& m, const T1& theta)
-{ 
+{
   typedef typename detail::promoted_numerical_type<T0, T1>::type T;
 
   // Find `cos` and `sin` by ADL.
@@ -296,5 +299,5 @@ polar(const T0& m, const T1& theta)
   return complex<T>(m * cos(theta), m * sin(theta));
 }
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/complex/c99math.h b/thrust/detail/complex/c99math.h
index 9c965839d..e735b850c 100644
--- a/thrust/detail/complex/c99math.h
+++ b/thrust/detail/complex/c99math.h
@@ -14,20 +14,22 @@
  *  See the License for the specific language governing permissions and
  *  limitations under the License.
  */
-#pragma once 
+#pragma once
 
+#include <thrust/detail/config.h>
+
+#include <math.h>
 #include <cmath>
 #include <thrust/detail/complex/math_private.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace complex
 {
 
 // Define basic arithmetic functions so we can use them without explicit scope
-// keeping the code as close as possible to FreeBSDs for ease of maintenance. 
+// keeping the code as close as possible to FreeBSDs for ease of maintenance.
 // It also provides an easy way to support compilers with missing C99 functions.
 // When possible, just use the names in the global scope.
 // Some platforms define these as macros, others as free functions.
@@ -83,11 +85,11 @@ __host__ __device__ inline int isnan(double x){
 }
 
 __host__ __device__ inline int signbit(float x){
-  return (*((uint32_t *)&x)) & 0x80000000;
+  return ((*((uint32_t *)&x)) & 0x80000000) != 0 ? 1 : 0;
 }
 
 __host__ __device__ inline int signbit(double x){
-  return (*((uint32_t *)&x)) & 0x80000000;
+  return ((*((uint64_t *)&x)) & 0x8000000000000000) != 0ull ? 1 : 0;
 }
 
 __host__ __device__ inline int isfinite(float x){
@@ -100,35 +102,25 @@ __host__ __device__ inline int isfinite(double x){
 
 #else
 
-#  if defined(__CUDACC__) && !(defined(__CUDA__) && defined(__clang__))
-
-// sometimes the CUDA toolkit provides these these names as macros,
-// sometimes functions in the global scope
-
-#    if (CUDA_VERSION >= 6500)
+#  if defined(__CUDACC__) && !(defined(__CUDA__) && defined(__clang__)) && !defined(_NVHPC_CUDA)
+// NVCC implements at least some signature of these as functions not macros.
 using ::isinf;
 using ::isnan;
 using ::signbit;
 using ::isfinite;
-
-#    else
-// these names are macros, we don't need to define them
-
-#    endif // CUDA_VERSION
-
 #  else
-// Some compilers do not provide these in the global scope
-// they are in std:: instead
+// Some compilers do not provide these in the global scope, because they are
+// supposed to be macros. The versions in `std` are supposed to be functions.
 // Since we're not compiling with nvcc, it's safe to use the functions in std::
 using std::isinf;
 using std::isnan;
 using std::signbit;
 using std::isfinite;
 #  endif // __CUDACC__
+#endif // _MSC_VER
 
 using ::atanh;
-#endif // _MSC_VER
-  
+
 #if defined _MSC_VER
 
 __host__ __device__ inline double copysign(double x, double y){
@@ -149,7 +141,7 @@ __host__ __device__ inline float copysignf(float x, float y){
 
 
 
-#ifndef __CUDACC__
+#if !defined(__CUDACC__) && !defined(_NVHPC_CUDA)
 
 // Simple approximation to log1p as Visual Studio is lacking one
 inline double log1p(double x){
@@ -159,7 +151,7 @@ inline double log1p(double x){
   }else{
     if(u > 2.0){
       // Use normal log for large arguments
-      return log(u); 
+      return log(u);
     }else{
       return log(u)*(x/(u-1.0));
     }
@@ -173,7 +165,7 @@ inline float log1pf(float x){
   }else{
     if(u > 2.0f){
       // Use normal log for large arguments
-      return logf(u); 
+      return logf(u);
     }else{
       return logf(u)*(x/(u-1.0f));
     }
@@ -201,5 +193,5 @@ inline double hypot(double x, double y){
 
 } // namespace detail
 
-} // namespace thrust
-      
+THRUST_NAMESPACE_END
+
diff --git a/thrust/detail/complex/catrig.h b/thrust/detail/complex/catrig.h
index 70adf03ff..4955ec5bf 100644
--- a/thrust/detail/complex/catrig.h
+++ b/thrust/detail/complex/catrig.h
@@ -48,27 +48,28 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 #include <cfloat>
 #include <cmath>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
-namespace complex{		      	
+namespace complex{
 
 using thrust::complex;
 
 __host__ __device__
 inline void raise_inexact(){
-  const volatile float tiny = 7.888609052210118054117286e-31; /* 0x1p-100; */ 
+  const volatile float tiny = 7.888609052210118054117286e-31; /* 0x1p-100; */
   // needs the volatile to prevent compiler from ignoring it
   volatile float junk = 1 + tiny;
   (void)junk;
 }
 
 __host__ __device__ inline complex<double> clog_for_large_values(complex<double> z);
-  
+
 /*
  * Testing indicates that all these functions are accurate up to 4 ULP.
  * The functions casin(h) and cacos(h) are about 2.5 times slower than asinh.
@@ -146,7 +147,7 @@ f(double a, double b, double hypot_a_b)
     return (a / 2);
   return (a * a / (hypot_a_b + b) / 2);
 }
-  
+
 /*
  * All the hard work is contained in this function.
  * x and y are assumed positive or zero, and less than RECIP_EPSILON.
@@ -167,10 +168,10 @@ do_hard_work(double x, double y, double *rx, int *B_is_usable, double *B,
   const double A_crossover = 10; /* Hull et al suggest 1.5, but 10 works better */
   const double FOUR_SQRT_MIN = 5.966672584960165394632772e-154; /* =0x1p-509; >= 4 * sqrt(DBL_MIN) */
   const double B_crossover = 0.6417; /* suggested by Hull et al */
-  
+
   R = hypot(x, y + 1);		/* |z+I| */
   S = hypot(x, y - 1);		/* |z-I| */
-  
+
   /* A = (|z+I| + |z-I|) / 2 */
   A = (R + S) / 2;
   /*
@@ -180,7 +181,7 @@ do_hard_work(double x, double y, double *rx, int *B_is_usable, double *B,
    */
   if (A < 1)
     A = 1;
-  
+
   if (A < A_crossover) {
     /*
      * Am1 = fp + fm, where fp = f(x, 1+y), and fm = f(x, 1-y).
@@ -214,9 +215,9 @@ do_hard_work(double x, double y, double *rx, int *B_is_usable, double *B,
   } else {
     *rx = log(A + sqrt(A * A - 1));
   }
-  
+
   *new_y = y;
-  
+
   if (y < FOUR_SQRT_MIN) {
     /*
      * Avoid a possible underflow caused by y/A.  For casinh this
@@ -228,11 +229,11 @@ do_hard_work(double x, double y, double *rx, int *B_is_usable, double *B,
     *new_y = y * (2 / DBL_EPSILON);
     return;
   }
-  
+
   /* B = (|z+I| - |z-I|) / 2 = y/A */
   *B = y / A;
   *B_is_usable = 1;
-  
+
   if (*B > B_crossover) {
     *B_is_usable = 0;
     /*
@@ -274,7 +275,7 @@ do_hard_work(double x, double y, double *rx, int *B_is_usable, double *B,
     }
   }
 }
-  
+
 /*
  * casinh(z) = z + O(z^3)   as z -> 0
  *
@@ -295,7 +296,7 @@ complex<double> casinh(complex<double> z)
   y = z.imag();
   ax = fabs(x);
   ay = fabs(y);
-  
+
   if (isnan(x) || isnan(y)) {
     /* casinh(+-Inf + I*NaN) = +-Inf + I*NaN */
     if (isinf(x))
@@ -350,10 +351,10 @@ __host__ __device__ inline
 complex<double> casin(complex<double> z)
 {
   complex<double> w = casinh(complex<double>(z.imag(), z.real()));
-  
+
   return (complex<double>(w.imag(), w.real()));
 }
-  
+
 /*
  * cacos(z) = PI/2 - casin(z)
  * but do the computation carefully so cacos(z) is accurate when z is
@@ -450,7 +451,7 @@ complex<double> cacosh(complex<double> z)
 {
   complex<double> w;
   double rx, ry;
-  
+
   w = cacos(z);
   rx = w.real();
   ry = w.imag();
@@ -476,7 +477,7 @@ complex<double> clog_for_large_values(complex<double> z)
   double x, y;
   double ax, ay, t;
   const double m_e = 2.7182818284590452e0; /*  0x15bf0a8b145769.0p-51 */
-  
+
   x = z.real();
   y = z.imag();
   ax = fabs(x);
@@ -486,7 +487,7 @@ complex<double> clog_for_large_values(complex<double> z)
     ax = ay;
     ay = t;
   }
-  
+
   /*
    * Avoid overflow in hypot() when x and y are both very large.
    * Divide x and y by E, and then add 1 to the logarithm.  This depends
@@ -496,7 +497,7 @@ complex<double> clog_for_large_values(complex<double> z)
    */
   if (ax > DBL_MAX / 2)
     return (complex<double>(log(hypot(x / m_e, y / m_e)) + 1, atan2(y, x)));
-  
+
   /*
    * Avoid overflow when x or y is large.  Avoid underflow when x or
    * y is small.
@@ -505,16 +506,16 @@ complex<double> clog_for_large_values(complex<double> z)
   const double SQRT_MIN =	1.491668146240041348658193e-154; /* = 0x1p-511; >= sqrt(DBL_MIN) */
   if (ax > QUARTER_SQRT_MAX || ay < SQRT_MIN)
     return (complex<double>(log(hypot(x, y)), atan2(y, x)));
-  
+
   return (complex<double>(log(ax * ax + ay * ay) / 2, atan2(y, x)));
 }
-  
+
 /*
  *				=================
  *				| catanh, catan |
  *				=================
  */
-  
+
 /*
    * sum_squares(x,y) = x*x + y*y (or just x*x if y*y would underflow).
    * Assumes x*x and y*y will not overflow.
@@ -529,10 +530,10 @@ inline double sum_squares(double x, double y)
   /* Avoid underflow when y is small. */
   if (y < SQRT_MIN)
     return (x * x);
-  
+
   return (x * x + y * y);
 }
-  
+
 /*
  * real_part_reciprocal(x, y) = Re(1/(x+I*y)) = x/(x*x + y*y).
  * Assumes x and y are not NaN, and one of x and y is larger than
@@ -548,7 +549,7 @@ inline double real_part_reciprocal(double x, double y)
   double scale;
   uint32_t hx, hy;
   int32_t ix, iy;
-  
+
   /*
    * This code is inspired by the C99 document n1124.pdf, Section G.5.1,
    * example 2.
@@ -574,8 +575,8 @@ inline double real_part_reciprocal(double x, double y)
   y *= scale;
   return (x / (x * x + y * y) * scale);
 }
-  
-  
+
+
 /*
  * catanh(z) = log((1+z)/(1-z)) / 2
  *           = log1p(4*x / |z-1|^2) / 4
@@ -588,28 +589,28 @@ inline double real_part_reciprocal(double x, double y)
  * Re(catanh(z)) = x/|z|^2 + O(x/z^4)
  *    as z -> infinity, uniformly in x
  */
-#if __cplusplus >= 201103L || !defined _MSC_VER
+#if THRUST_CPP_DIALECT >= 2011 || THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
 __host__ __device__ inline
 complex<double> catanh(complex<double> z)
 {
   double x, y, ax, ay, rx, ry;
   const volatile double pio2_lo = 6.1232339957367659e-17; /*  0x11a62633145c07.0p-106 */
   const double pio2_hi = 1.5707963267948966e0;/*  0x1921fb54442d18.0p-52 */
-  
-  
+
+
   x = z.real();
   y = z.imag();
   ax = fabs(x);
   ay = fabs(y);
-  
+
   /* This helps handle many cases. */
   if (y == 0 && ax <= 1)
     return (complex<double>(atanh(x), y));
-  
+
   /* To ensure the same accuracy as atan(), and to filter out z = 0. */
   if (x == 0)
     return (complex<double>(x, atan(y)));
-  
+
   if (isnan(x) || isnan(y)) {
     /* catanh(+-Inf + I*NaN) = +-0 + I*NaN */
     if (isinf(x))
@@ -625,12 +626,12 @@ complex<double> catanh(complex<double> z)
      */
     return (complex<double>(x + 0.0 + (y + 0), x + 0.0 + (y + 0)));
   }
-  
+
   const double RECIP_EPSILON = 1.0 / DBL_EPSILON;
   if (ax > RECIP_EPSILON || ay > RECIP_EPSILON)
     return (complex<double>(real_part_reciprocal(x, y),
 			    copysign(pio2_hi + pio2_lo, y)));
-  
+
   const double SQRT_3_EPSILON = 2.5809568279517849e-8; /*  0x1bb67ae8584caa.0p-78 */
   if (ax < SQRT_3_EPSILON / 2 && ay < SQRT_3_EPSILON / 2) {
     /*
@@ -641,23 +642,23 @@ complex<double> catanh(complex<double> z)
     raise_inexact();
     return (z);
   }
-  
+
   const double m_ln2 = 6.9314718055994531e-1; /*  0x162e42fefa39ef.0p-53 */
   if (ax == 1 && ay < DBL_EPSILON)
     rx = (m_ln2 - log(ay)) / 2;
   else
     rx = log1p(4 * ax / sum_squares(ax - 1, ay)) / 4;
-  
+
   if (ax == 1)
     ry = atan2(2.0, -ay) / 2;
   else if (ay < DBL_EPSILON)
     ry = atan2(2 * ay, (1 - ax) * (1 + ax)) / 2;
   else
     ry = atan2(2 * ay, (1 - ax) * (1 + ax) - ay * ay) / 2;
-  
+
   return (complex<double>(copysign(rx, x), copysign(ry, y)));
 }
-  
+
 /*
  * catan(z) = reverse(catanh(reverse(z)))
  * where reverse(x + I*y) = y + I*x = I*conj(z).
@@ -691,20 +692,20 @@ inline complex<ValueType> asin(const complex<ValueType>& z){
   const complex<ValueType> i(0,1);
   return -i*asinh(i*z);
 }
-  
+
 template <typename ValueType>
 __host__ __device__
 inline complex<ValueType> atan(const complex<ValueType>& z){
   const complex<ValueType> i(0,1);
   return -i*thrust::atanh(i*z);
 }
-  
+
 
 template <typename ValueType>
 __host__ __device__
 inline complex<ValueType> acosh(const complex<ValueType>& z){
   thrust::complex<ValueType> ret((z.real() - z.imag()) * (z.real() + z.imag()) - ValueType(1.0),
-				 ValueType(2.0) * z.real() * z.imag());    
+				 ValueType(2.0) * z.real() * z.imag());
   ret = thrust::sqrt(ret);
   if (z.real() < ValueType(0.0)){
     ret = -ret;
@@ -716,43 +717,43 @@ inline complex<ValueType> acosh(const complex<ValueType>& z){
   }
   return ret;
 }
-  
+
 template <typename ValueType>
 __host__ __device__
 inline complex<ValueType> asinh(const complex<ValueType>& z){
   return thrust::log(thrust::sqrt(z*z+ValueType(1))+z);
 }
-  
+
 template <typename ValueType>
 __host__ __device__
 inline complex<ValueType> atanh(const complex<ValueType>& z){
-  ValueType imag2 = z.imag() *  z.imag();   
+  ValueType imag2 = z.imag() *  z.imag();
   ValueType n = ValueType(1.0) + z.real();
   n = imag2 + n * n;
-  
+
   ValueType d = ValueType(1.0) - z.real();
   d = imag2 + d * d;
   complex<ValueType> ret(ValueType(0.25) * (std::log(n) - std::log(d)),0);
-  
+
   d = ValueType(1.0) -  z.real() * z.real() - imag2;
-  
+
   ret.imag(ValueType(0.5) * std::atan2(ValueType(2.0) * z.imag(), d));
   return ret;
 }
-  
+
 template <>
 __host__ __device__
 inline complex<double> acos(const complex<double>& z){
   return detail::complex::cacos(z);
 }
-  
+
 template <>
 __host__ __device__
 inline complex<double> asin(const complex<double>& z){
   return detail::complex::casin(z);
 }
-  
-#if __cplusplus >= 201103L || !defined _MSC_VER
+
+#if THRUST_CPP_DIALECT >= 2011 || THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
 template <>
 __host__ __device__
 inline complex<double> atan(const complex<double>& z){
@@ -772,8 +773,8 @@ __host__ __device__
 inline complex<double> asinh(const complex<double>& z){
   return detail::complex::casinh(z);
 }
-  
-#if __cplusplus >= 201103L || !defined _MSC_VER
+
+#if THRUST_CPP_DIALECT >= 2011 || THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
 template <>
 __host__ __device__
 inline complex<double> atanh(const complex<double>& z){
@@ -781,4 +782,4 @@ inline complex<double> atanh(const complex<double>& z){
 }
 #endif
 
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/catrigf.h b/thrust/detail/complex/catrigf.h
index db04c466a..c06791311 100644
--- a/thrust/detail/complex/catrigf.h
+++ b/thrust/detail/complex/catrigf.h
@@ -50,10 +50,11 @@
 
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
+#include <thrust/detail/config.h>
 #include <cfloat>
 #include <cmath>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{		      	
 
@@ -166,7 +167,7 @@ casinhf(complex<float> z)
   float x, y, ax, ay, rx, ry, B, sqrt_A2my2, new_y;
   int B_is_usable;
   complex<float> w;
-  const float RECIP_EPSILON = 1.0 / FLT_EPSILON;
+  const float RECIP_EPSILON = 1.0f / FLT_EPSILON;
   const float m_ln2 = 6.9314718055994531e-1f; /*  0x162e42fefa39ef.0p-53 */
   x = z.real();
   y = z.imag();
@@ -245,7 +246,7 @@ complex<float> cacosf(complex<float> z)
     return (complex<float>(x + 0.0f + (y + 0), x + 0.0f + (y + 0)));
   }
 
-  const float RECIP_EPSILON = 1.0 / FLT_EPSILON;
+  const float RECIP_EPSILON = 1.0f / FLT_EPSILON;
   if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) {
     w = clog_for_large_values(z);
     rx = fabsf(w.imag());
@@ -386,13 +387,13 @@ inline float real_part_reciprocal(float x, float y)
   return (x / (x * x + y * y) * scale);
 }
 
-#if __cplusplus >= 201103L || !defined _MSC_VER
+#if THRUST_CPP_DIALECT >= 2011 || THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
 __host__ __device__ inline
 complex<float> catanhf(complex<float> z)
 {
   float x, y, ax, ay, rx, ry;
-  const volatile float pio2_lo = 6.1232339957367659e-17; /*  0x11a62633145c07.0p-106 */
-  const float pio2_hi = 1.5707963267948966e0;/*  0x1921fb54442d18.0p-52 */
+  const volatile float pio2_lo = 6.1232339957367659e-17f; /*  0x11a62633145c07.0p-106 */
+  const float pio2_hi = 1.5707963267948966e0f;/*  0x1921fb54442d18.0p-52 */
 
 
   x = z.real();
@@ -421,7 +422,7 @@ complex<float> catanhf(complex<float> z)
     return (complex<float>(real_part_reciprocal(x, y),
 			   copysignf(pio2_hi + pio2_lo, y)));
 
-  const float SQRT_3_EPSILON = 5.9801995673e-4; /*  0x9cc471.0p-34 */
+  const float SQRT_3_EPSILON = 5.9801995673e-4f; /*  0x9cc471.0p-34 */
   if (ax < SQRT_3_EPSILON / 2 && ay < SQRT_3_EPSILON / 2) {
     raise_inexact();
     return (z);
@@ -467,7 +468,7 @@ inline complex<float> asin(const complex<float>& z){
   return detail::complex::casinf(z);
 }
 
-#if __cplusplus >= 201103L || !defined _MSC_VER
+#if THRUST_CPP_DIALECT >= 2011 || THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
 template <>
 __host__ __device__
 inline complex<float> atan(const complex<float>& z){
@@ -488,7 +489,7 @@ inline complex<float> asinh(const complex<float>& z){
   return detail::complex::casinhf(z);
 }
 
-#if __cplusplus >= 201103L || !defined _MSC_VER
+#if THRUST_CPP_DIALECT >= 2011 || THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
 template <>
 __host__ __device__
 inline complex<float> atanh(const complex<float>& z){
@@ -496,4 +497,4 @@ inline complex<float> atanh(const complex<float>& z){
 }
 #endif
 
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/ccosh.h b/thrust/detail/complex/ccosh.h
index 300f08afc..722dfcd84 100644
--- a/thrust/detail/complex/ccosh.h
+++ b/thrust/detail/complex/ccosh.h
@@ -47,10 +47,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{		      	
 
@@ -210,4 +212,4 @@ inline thrust::complex<double> cosh(const thrust::complex<double>& z){
   return detail::complex::ccosh(z);
 }
 
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/ccoshf.h b/thrust/detail/complex/ccoshf.h
index d33af7c4c..aa43f1208 100644
--- a/thrust/detail/complex/ccoshf.h
+++ b/thrust/detail/complex/ccoshf.h
@@ -48,10 +48,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{		      	
 
@@ -138,4 +140,4 @@ inline complex<float> cosh(const complex<float>& z){
   return detail::complex::ccoshf(z);
 }
   
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/cexp.h b/thrust/detail/complex/cexp.h
index 151df397b..c0c8c07d2 100644
--- a/thrust/detail/complex/cexp.h
+++ b/thrust/detail/complex/cexp.h
@@ -49,10 +49,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{		      	
 /*
@@ -180,4 +182,4 @@ inline complex<double> exp(const complex<double>& z){
   return detail::complex::cexp(z);
 }
 
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/cexpf.h b/thrust/detail/complex/cexpf.h
index 6d85c45ed..cae030fe7 100644
--- a/thrust/detail/complex/cexpf.h
+++ b/thrust/detail/complex/cexpf.h
@@ -49,10 +49,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{
 
@@ -158,4 +160,4 @@ inline complex<float> exp(const complex<float>& z){
   return detail::complex::cexpf(z);
 }    
   
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/clog.h b/thrust/detail/complex/clog.h
index 8d288df02..b727121c3 100644
--- a/thrust/detail/complex/clog.h
+++ b/thrust/detail/complex/clog.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *  Copyright 2013 Filipe RNC Maia
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
@@ -46,10 +46,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{
 
@@ -58,7 +60,7 @@ using thrust::complex;
 /* round down to 18 = 54/3 bits */
 __host__ __device__ inline
 double trim(double x){
-  uint32_t hi;    
+  uint32_t hi;
   get_high_word(hi, x);
   insert_words(x, hi &0xfffffff8, 0);
   return x;
@@ -120,7 +122,7 @@ complex<double> clog(const complex<double>& z){
     return (complex<double>(std::log(hypot(x, y)), std::atan2(y, x)));
   }
 
-  /* 
+  /*
    * From this point on, we don't need to worry about underflow or
    * overflow in calculating ax*ax or ay*ay.
    */
@@ -183,7 +185,7 @@ complex<double> clog(const complex<double>& z){
   }
   return (complex<double>(0.5 * log1p(hm1), atan2(y, x)));
 }
-  
+
 } // namespace complex
 
 } // namespace detail
@@ -202,11 +204,11 @@ inline complex<double> log(const complex<double>& z){
 
 template <typename ValueType>
 __host__ __device__
-inline complex<ValueType> log10(const complex<ValueType>& z){ 
+inline complex<ValueType> log10(const complex<ValueType>& z){
   // Using the explicit literal prevents compile time warnings in
-  // devices that don't support doubles 
+  // devices that don't support doubles
   return thrust::log(z)/ValueType(2.30258509299404568402);
 }
 
-} // namespace thrust
-    
+THRUST_NAMESPACE_END
+
diff --git a/thrust/detail/complex/clogf.h b/thrust/detail/complex/clogf.h
index 7f3314ed2..c72370c42 100644
--- a/thrust/detail/complex/clogf.h
+++ b/thrust/detail/complex/clogf.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *  Copyright 2013 Filipe RNC Maia
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
@@ -45,10 +45,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{
 
@@ -74,7 +76,7 @@ complex<float> clogf(const complex<float>& z){
   float ax, ay;
   float x0, y0, x1, y1, x2, y2, t, hm1;
   float val[12];
-  int i, sorted;	
+  int i, sorted;
   const float e = 2.7182818284590452354f;
 
   x = z.real();
@@ -102,7 +104,7 @@ complex<float> clogf(const complex<float>& z){
    */
   // For high values of ay -> hypotf(FLT_MAX,ay) = inf
   // We expect that for values at or below ay = 1e34f this should not happen
-  if (ay > 1e34f){ 
+  if (ay > 1e34f){
     return (complex<float>(std::log(hypotf(x / e, y / e)) + 1.0f, std::atan2(y, x)));
   }
   if (ax == 1.f) {
@@ -120,7 +122,7 @@ complex<float> clogf(const complex<float>& z){
     return (complex<float>(std::log(hypotf(x, y)), std::atan2(y, x)));
   }
 
-  /* 
+  /*
    * From this point on, we don't need to worry about underflow or
    * overflow in calculating ax*ax or ay*ay.
    */
@@ -194,5 +196,5 @@ inline complex<float> log(const complex<float>& z){
   return detail::complex::clogf(z);
 }
 
-} // namespace thrust
-    
+THRUST_NAMESPACE_END
+
diff --git a/thrust/detail/complex/complex.inl b/thrust/detail/complex/complex.inl
index 2e2a106bc..a00b81a4b 100644
--- a/thrust/detail/complex/complex.inl
+++ b/thrust/detail/complex/complex.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *  Copyright 2013 Filipe RNC Maia
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,12 +15,14 @@
  *  limitations under the License.
  */
 
-#include <thrust/complex.h>
+#pragma once
+
+#include <thrust/detail/config.h>
 
+#include <thrust/complex.h>
 #include <thrust/type_traits/is_trivially_relocatable.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /* --- Constructors --- */
 
@@ -330,7 +332,7 @@ bool operator!=(const complex<T0>& x, const T1& y)
 template <typename T>
 struct proclaim_trivially_relocatable<complex<T> > : thrust::true_type {};
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/complex/arithmetic.h>
 #include <thrust/detail/complex/cproj.h>
diff --git a/thrust/detail/complex/cpow.h b/thrust/detail/complex/cpow.h
index 2d6ad051e..c204c451f 100644
--- a/thrust/detail/complex/cpow.h
+++ b/thrust/detail/complex/cpow.h
@@ -17,10 +17,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust {
+THRUST_NAMESPACE_BEGIN
 
 template <typename T0, typename T1>
 __host__ __device__
@@ -51,5 +53,5 @@ pow(const T0& x, const complex<T1>& y)
   return exp(log(T(x)) * complex<T>(y));
 }
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/complex/cproj.h b/thrust/detail/complex/cproj.h
index 563c92f69..7537c99fd 100644
--- a/thrust/detail/complex/cproj.h
+++ b/thrust/detail/complex/cproj.h
@@ -17,11 +17,13 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 #include <cmath>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{	 
 __host__ __device__
@@ -67,5 +69,4 @@ inline thrust::complex<float> proj(const thrust::complex<float>& z){
   return detail::complex::cprojf(z);
 }
 
-}
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/csinh.h b/thrust/detail/complex/csinh.h
index 869f367f2..b5a22af01 100644
--- a/thrust/detail/complex/csinh.h
+++ b/thrust/detail/complex/csinh.h
@@ -48,10 +48,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{		      	
 
@@ -202,4 +204,4 @@ inline complex<double> sinh(const complex<double>& z){
   return detail::complex::csinh(z);
 }
 
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/csinhf.h b/thrust/detail/complex/csinhf.h
index bf4fb0816..d271081c6 100644
--- a/thrust/detail/complex/csinhf.h
+++ b/thrust/detail/complex/csinhf.h
@@ -48,10 +48,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{		      	
 
@@ -139,4 +141,4 @@ inline complex<float> sinh(const complex<float>& z){
   return detail::complex::csinhf(z);
 }
 
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/csqrt.h b/thrust/detail/complex/csqrt.h
index dcffbee95..eb4da5289 100644
--- a/thrust/detail/complex/csqrt.h
+++ b/thrust/detail/complex/csqrt.h
@@ -49,11 +49,13 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 #include <cmath>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{		      	
 
@@ -149,4 +151,4 @@ inline complex<double> sqrt(const complex<double>& z){
   return detail::complex::csqrt(z);
 }
 
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/csqrtf.h b/thrust/detail/complex/csqrtf.h
index 125d4b60d..dba489a33 100644
--- a/thrust/detail/complex/csqrtf.h
+++ b/thrust/detail/complex/csqrtf.h
@@ -49,11 +49,13 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 #include <cmath>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{		      	
 
@@ -144,4 +146,4 @@ inline complex<float> sqrt(const complex<float>& z){
   return detail::complex::csqrtf(z);
 }
 
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/ctanh.h b/thrust/detail/complex/ctanh.h
index 6ef159092..3275c0343 100644
--- a/thrust/detail/complex/ctanh.h
+++ b/thrust/detail/complex/ctanh.h
@@ -87,11 +87,13 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 #include <cmath>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{		      	
 
@@ -197,4 +199,4 @@ inline complex<double> tanh(const complex<double>& z){
   return detail::complex::ctanh(z);
 }
   
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/ctanhf.h b/thrust/detail/complex/ctanhf.h
index f6923d1df..221b5ce47 100644
--- a/thrust/detail/complex/ctanhf.h
+++ b/thrust/detail/complex/ctanhf.h
@@ -52,11 +52,13 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 #include <cmath>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{		      	
 
@@ -121,4 +123,4 @@ inline complex<float> tanh(const complex<float>& z){
   return detail::complex::ctanhf(z);
 }
 
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/math_private.h b/thrust/detail/complex/math_private.h
index bc2d6357f..3a40c8e72 100644
--- a/thrust/detail/complex/math_private.h
+++ b/thrust/detail/complex/math_private.h
@@ -35,7 +35,7 @@
 #include <thrust/complex.h>
 #include <thrust/detail/cstdint.h>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{
 
@@ -130,7 +130,7 @@ void  extract_words(int32_t & ix0,int32_t & ix1, double d){
 
 } // namespace detail
 
-} // namespace thrust
+THRUST_NAMESPACE_END
 
 
 #include <thrust/detail/complex/c99math.h>
diff --git a/thrust/detail/complex/stream.h b/thrust/detail/complex/stream.h
index 9d87bbd54..95434b41b 100644
--- a/thrust/detail/complex/stream.h
+++ b/thrust/detail/complex/stream.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *  Copyright 2013 Filipe RNC Maia
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,26 +15,29 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 template<typename ValueType,class charT, class traits>
 std::basic_ostream<charT, traits>& operator<<(std::basic_ostream<charT, traits>& os, const complex<ValueType>& z)
 {
   os << '(' << z.real() << ',' << z.imag() << ')';
   return os;
 }
-  
+
 template<typename ValueType, typename charT, class traits>
 std::basic_istream<charT, traits>&
 operator>>(std::basic_istream<charT, traits>& is, complex<ValueType>& z)
 {
   ValueType re, im;
-    
+
   charT ch;
   is >> ch;
-    
+
   if(ch == '(')
     {
       is >> re >> ch;
@@ -68,4 +71,4 @@ operator>>(std::basic_istream<charT, traits>& is, complex<ValueType>& z)
   return is;
 }
 
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/config/compiler.h b/thrust/detail/config/compiler.h
index c26f03890..e35652f6a 100644
--- a/thrust/detail/config/compiler.h
+++ b/thrust/detail/config/compiler.h
@@ -25,18 +25,23 @@
 #define THRUST_HOST_COMPILER_MSVC    1
 #define THRUST_HOST_COMPILER_GCC     2
 #define THRUST_HOST_COMPILER_CLANG   3
+#define THRUST_HOST_COMPILER_INTEL   4
 
 // enumerate device compilers we know about
 #define THRUST_DEVICE_COMPILER_UNKNOWN 0
 #define THRUST_DEVICE_COMPILER_MSVC    1
 #define THRUST_DEVICE_COMPILER_GCC     2
-#define THRUST_DEVICE_COMPILER_NVCC    3
-#define THRUST_DEVICE_COMPILER_CLANG   4
+#define THRUST_DEVICE_COMPILER_CLANG   3
+#define THRUST_DEVICE_COMPILER_NVCC    4
 
 // figure out which host compiler we're using
 // XXX we should move the definition of THRUST_DEPRECATED out of this logic
 #if   defined(_MSC_VER)
 #define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_MSVC
+#define THRUST_MSVC_VERSION _MSC_VER
+#define THRUST_MSVC_VERSION_FULL _MSC_FULL_VER
+#elif defined(__ICC)
+#define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_INTEL
 #elif defined(__clang__)
 #define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_CLANG
 #define THRUST_CLANG_VERSION (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__)
@@ -53,7 +58,7 @@
 #endif // THRUST_HOST_COMPILER
 
 // figure out which device compiler we're using
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
 #define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_NVCC
 #elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
 #define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_MSVC
@@ -71,7 +76,7 @@
 #endif
 
 // is the device compiler capable of compiling omp?
-#ifdef _OPENMP
+#if defined(_OPENMP) || defined(_NVHPC_STDPAR_OPENMP)
 #define THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE THRUST_TRUE
 #else
 #define THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE THRUST_FALSE
@@ -181,14 +186,4 @@
   THRUST_DISABLE_CLANG_AND_GCC_INITIALIZER_REORDERING_WARNING_END             \
   /**/
 
-// TODO we should move the definition of THRUST_DEPRECATED out of this logic
-#if   THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
-  #define THRUST_DEPRECATED __declspec(deprecated)
-#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG
-  #define THRUST_DEPRECATED __attribute__((deprecated))
-#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC
-  #define THRUST_DEPRECATED __attribute__((deprecated))
-#else
-  #define THRUST_DEPRECATED
-#endif
 
diff --git a/thrust/detail/config/config.h b/thrust/detail/config/config.h
index 41a293a80..797f6605b 100644
--- a/thrust/detail/config/config.h
+++ b/thrust/detail/config/config.h
@@ -26,6 +26,7 @@
 #include <thrust/detail/config/compiler.h>
 #include <thrust/detail/config/cpp_dialect.h>
 #include <thrust/detail/config/cpp_compatibility.h>
+#include <thrust/detail/config/deprecated.h>
 // host_system.h & device_system.h must be #included as early as possible
 // because other config headers depend on it
 #include <thrust/detail/config/host_system.h>
@@ -35,4 +36,5 @@
 #include <thrust/detail/config/forceinline.h>
 #include <thrust/detail/config/exec_check_disable.h>
 #include <thrust/detail/config/global_workarounds.h>
+#include <thrust/detail/config/namespace.h>
 
diff --git a/thrust/detail/config/cpp_compatibility.h b/thrust/detail/config/cpp_compatibility.h
index 5d48d6152..18b9cbdcf 100644
--- a/thrust/detail/config/cpp_compatibility.h
+++ b/thrust/detail/config/cpp_compatibility.h
@@ -20,42 +20,40 @@
 
 #include <cstddef>
 
-#if THRUST_CPP_DIALECT >= 2011
-#  ifndef __has_cpp_attribute
-#    define __has_cpp_attribute(X) 0
-#  endif
-
-#  if __has_cpp_attribute(nodiscard)
-#    define THRUST_NODISCARD [[nodiscard]]
-#  endif
+#ifndef __has_cpp_attribute
+#  define __has_cpp_attribute(X) 0
+#endif
 
-#  define THRUST_CONSTEXPR constexpr
-#  define THRUST_OVERRIDE override
-#  define THRUST_DEFAULT = default;
-#  define THRUST_NOEXCEPT noexcept
-#  define THRUST_FINAL final
+// Trailing return types seem to confuse Doxygen, and cause it to interpret
+// parts of the function's body as new function signatures.
+#if defined(THRUST_DOXYGEN)
+#  define THRUST_TRAILING_RETURN(...)
 #else
-#  define THRUST_CONSTEXPR
-#  define THRUST_OVERRIDE
-#  define THRUST_DEFAULT {}
-#  define THRUST_NOEXCEPT throw()
-#  define THRUST_FINAL
+#  define THRUST_TRAILING_RETURN(...) -> __VA_ARGS__
 #endif
 
-#ifndef THRUST_NODISCARD
+#if THRUST_CPP_DIALECT >= 2014 && __has_cpp_attribute(nodiscard)
+#  define THRUST_NODISCARD [[nodiscard]]
+#else
 #  define THRUST_NODISCARD
 #endif
 
+#if THRUST_CPP_DIALECT >= 2017 && __cpp_if_constexpr
+#  define THRUST_IF_CONSTEXPR if constexpr
+#else
+#  define THRUST_IF_CONSTEXPR if
+#endif
+
 // FIXME: Combine THRUST_INLINE_CONSTANT and
 // THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT into one macro when NVCC properly
 // supports `constexpr` globals in host and device code.
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA)
 // FIXME: Add this when NVCC supports inline variables.
 //#  if   THRUST_CPP_DIALECT >= 2017
 //#    define THRUST_INLINE_CONSTANT                 inline constexpr
 //#    define THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT inline constexpr
 #  if THRUST_CPP_DIALECT >= 2011
-#    define THRUST_INLINE_CONSTANT                 static constexpr
+#    define THRUST_INLINE_CONSTANT                 static const __device__
 #    define THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT static constexpr
 #  else
 #    define THRUST_INLINE_CONSTANT                 static const __device__
@@ -75,3 +73,29 @@
 #  endif
 #endif
 
+// These definitions were intended for internal use only and are now obsolete.
+// If you relied on them, consider porting your code to use the functionality
+// in libcu++'s <nv/target> header.
+// For a temporary workaround, define THRUST_PROVIDE_LEGACY_ARCH_MACROS to make
+// them available again. These should be considered deprecated and will be
+// fully removed in a future version.
+#ifdef THRUST_PROVIDE_LEGACY_ARCH_MACROS
+  #ifndef THRUST_IS_DEVICE_CODE
+    #if defined(_NVHPC_CUDA)
+      #define THRUST_IS_DEVICE_CODE __builtin_is_device_code()
+      #define THRUST_IS_HOST_CODE (!__builtin_is_device_code())
+      #define THRUST_INCLUDE_DEVICE_CODE 1
+      #define THRUST_INCLUDE_HOST_CODE 1
+    #elif defined(__CUDA_ARCH__)
+      #define THRUST_IS_DEVICE_CODE 1
+      #define THRUST_IS_HOST_CODE 0
+      #define THRUST_INCLUDE_DEVICE_CODE 1
+      #define THRUST_INCLUDE_HOST_CODE 0
+    #else
+      #define THRUST_IS_DEVICE_CODE 0
+      #define THRUST_IS_HOST_CODE 1
+      #define THRUST_INCLUDE_DEVICE_CODE 0
+      #define THRUST_INCLUDE_HOST_CODE 1
+    #endif
+  #endif
+#endif // THRUST_PROVIDE_LEGACY_ARCH_MACROS
diff --git a/thrust/detail/config/cpp_dialect.h b/thrust/detail/config/cpp_dialect.h
index 06cc3f2f1..46b0caec7 100644
--- a/thrust/detail/config/cpp_dialect.h
+++ b/thrust/detail/config/cpp_dialect.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2018 NVIDIA Corporation
+ *  Copyright 2020 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,19 +14,127 @@
  *  limitations under the License.
  */
 
+/*! \file cpp_dialect.h
+ *  \brief Detect the version of the C++ standard used by the compiler.
+ */
+
 #pragma once
 
-#if   __cplusplus < 201103L
-  #define THRUST_CPP03
-  #define THRUST_CPP_DIALECT 2003
-#elif __cplusplus < 201402L
-  #define THRUST_CPP11
-  #define THRUST_CPP_DIALECT 2011
-#elif __cplusplus < 201703L
-  #define THRUST_CPP14
-  #define THRUST_CPP_DIALECT 2014
-#else
-  #define THRUST_CPP17
-  #define THRUST_CPP_DIALECT 2017
+#include <thrust/detail/config/compiler.h>
+
+// Deprecation warnings may be silenced by defining the following macros. These
+// may be combined.
+// - THRUST_IGNORE_DEPRECATED_CPP_DIALECT:
+//   Ignore all deprecated C++ dialects and outdated compilers.
+// - THRUST_IGNORE_DEPRECATED_CPP_11:
+//   Ignore deprecation warnings when compiling with C++11. C++03 and outdated
+//   compilers will still issue warnings.
+// - THRUST_IGNORE_DEPRECATED_COMPILER
+//   Ignore deprecation warnings when using deprecated compilers. Compiling
+//   with C++03 and C++11 will still issue warnings.
+
+// Check for the CUB opt-outs as well:
+#if !defined(THRUST_IGNORE_DEPRECATED_CPP_DIALECT) && \
+     defined(CUB_IGNORE_DEPRECATED_CPP_DIALECT)
+#  define    THRUST_IGNORE_DEPRECATED_CPP_DIALECT
+#endif
+#if !defined(THRUST_IGNORE_DEPRECATED_CPP_11) && \
+     defined(CUB_IGNORE_DEPRECATED_CPP_11)
+#  define    THRUST_IGNORE_DEPRECATED_CPP_11
+#endif
+#if !defined(THRUST_IGNORE_DEPRECATED_COMPILER) && \
+     defined(CUB_IGNORE_DEPRECATED_COMPILER)
+#  define    THRUST_IGNORE_DEPRECATED_COMPILER
+#endif
+
+#ifdef THRUST_IGNORE_DEPRECATED_CPP_DIALECT
+#  define THRUST_IGNORE_DEPRECATED_CPP_11
+#  define THRUST_IGNORE_DEPRECATED_COMPILER
 #endif
 
+// Define this to override the built-in detection.
+#ifndef THRUST_CPP_DIALECT
+
+// MSVC does not define __cplusplus correctly. _MSVC_LANG is used instead.
+// This macro is only defined in MSVC 2015U3+.
+#  ifdef _MSVC_LANG // Do not replace with THRUST_HOST_COMPILER test (see above)
+// MSVC2015 reports C++14 but lacks extended constexpr support. Treat as C++11.
+#    if THRUST_MSVC_VERSION < 1910 && _MSVC_LANG > 201103L /* MSVC < 2017 && CPP > 2011 */
+#      define THRUST_CPLUSPLUS 201103L /* Fix to 2011 */
+#    else
+#      define THRUST_CPLUSPLUS _MSVC_LANG /* We'll trust this for now. */
+#    endif // MSVC 2015 C++14 fix
+#  else
+#    define THRUST_CPLUSPLUS __cplusplus
+#  endif
+
+// Detect current dialect:
+#  if THRUST_CPLUSPLUS < 201103L
+#    define THRUST_CPP_DIALECT 2003
+#  elif THRUST_CPLUSPLUS < 201402L
+#    define THRUST_CPP_DIALECT 2011
+#  elif THRUST_CPLUSPLUS < 201703L
+#    define THRUST_CPP_DIALECT 2014
+#  elif THRUST_CPLUSPLUS == 201703L
+#    define THRUST_CPP_DIALECT 2017
+#  elif THRUST_CPLUSPLUS > 201703L // unknown, but is higher than 2017.
+#    define THRUST_CPP_DIALECT 2020
+#  endif
+
+#  undef THRUST_CPLUSPLUS // cleanup
+
+#endif // !THRUST_CPP_DIALECT
+
+// Define THRUST_COMPILER_DEPRECATION macro:
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+#  define THRUST_COMP_DEPR_IMPL(msg) \
+    __pragma(message(__FILE__ ":" THRUST_COMP_DEPR_IMPL0(__LINE__) ": warning: " #msg))
+#  define THRUST_COMP_DEPR_IMPL0(x) THRUST_COMP_DEPR_IMPL1(x)
+#  define THRUST_COMP_DEPR_IMPL1(x) #x
+#else // clang / gcc:
+#  define THRUST_COMP_DEPR_IMPL(msg) THRUST_COMP_DEPR_IMPL0(GCC warning #msg)
+#  define THRUST_COMP_DEPR_IMPL0(expr) _Pragma(#expr)
+#  define THRUST_COMP_DEPR_IMPL1 /* intentionally blank */
+#endif
+
+#define THRUST_COMPILER_DEPRECATION(REQ) \
+  THRUST_COMP_DEPR_IMPL(Thrust requires at least REQ. Define THRUST_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.)
+
+#define THRUST_COMPILER_DEPRECATION_SOFT(REQ, CUR) \
+  THRUST_COMP_DEPR_IMPL(Thrust requires at least REQ. CUR is deprecated but still supported. CUR support will be removed in a future release. Define THRUST_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.)
+
+#ifndef THRUST_IGNORE_DEPRECATED_COMPILER
+
+// Compiler checks:
+#  if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC && THRUST_GCC_VERSION < 50000
+     THRUST_COMPILER_DEPRECATION(GCC 5.0);
+#  elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG && THRUST_CLANG_VERSION < 70000
+     THRUST_COMPILER_DEPRECATION(Clang 7.0);
+#  elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC && THRUST_MSVC_VERSION < 1910
+     // <2017. Hard upgrade message:
+     THRUST_COMPILER_DEPRECATION(MSVC 2019 (19.20/16.0/14.20));
+#  elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC && THRUST_MSVC_VERSION < 1920
+     // >=2017, <2019. Soft deprecation message:
+     THRUST_COMPILER_DEPRECATION_SOFT(MSVC 2019 (19.20/16.0/14.20), MSVC 2017);
+#  endif
+
+#endif // THRUST_IGNORE_DEPRECATED_COMPILER
+
+#ifndef THRUST_IGNORE_DEPRECATED_DIALECT
+
+// Dialect checks:
+#  if THRUST_CPP_DIALECT < 2011
+     // <C++11. Hard upgrade message:
+     THRUST_COMPILER_DEPRECATION(C++14);
+#  elif THRUST_CPP_DIALECT == 2011 && !defined(THRUST_IGNORE_DEPRECATED_CPP_11)
+     // =C++11. Soft upgrade message:
+     THRUST_COMPILER_DEPRECATION_SOFT(C++14, C++11);
+#  endif
+
+#endif // THRUST_IGNORE_DEPRECATED_DIALECT
+
+#undef THRUST_COMPILER_DEPRECATION_SOFT
+#undef THRUST_COMPILER_DEPRECATION
+#undef THRUST_COMP_DEPR_IMPL
+#undef THRUST_COMP_DEPR_IMPL0
+#undef THRUST_COMP_DEPR_IMPL1
diff --git a/thrust/detail/config/deprecated.h b/thrust/detail/config/deprecated.h
new file mode 100644
index 000000000..05851c676
--- /dev/null
+++ b/thrust/detail/config/deprecated.h
@@ -0,0 +1,42 @@
+/*
+ *  Copyright 2018-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file deprecated.h
+ *  \brief Defines the THRUST_DEPRECATED macro
+ */
+
+#pragma once
+
+#include <thrust/detail/config/compiler.h>
+#include <thrust/detail/config/cpp_dialect.h>
+
+#if defined(CUB_IGNORE_DEPRECATED_API) && !defined(THRUST_IGNORE_DEPRECATED_API)
+#  define THRUST_IGNORE_DEPRECATED_API
+#endif
+
+#ifdef THRUST_IGNORE_DEPRECATED_API
+#  define THRUST_DEPRECATED
+#elif THRUST_CPP_DIALECT >= 2014
+#  define THRUST_DEPRECATED [[deprecated]]
+#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+#  define THRUST_DEPRECATED __declspec(deprecated)
+#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG
+#  define THRUST_DEPRECATED __attribute__((deprecated))
+#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC
+#  define THRUST_DEPRECATED __attribute__((deprecated))
+#else
+#  define THRUST_DEPRECATED
+#endif
diff --git a/thrust/detail/config/device_system.h b/thrust/detail/config/device_system.h
index c4106d3fb..29418c903 100644
--- a/thrust/detail/config/device_system.h
+++ b/thrust/detail/config/device_system.h
@@ -26,25 +26,8 @@
 #define THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_CUDA
 #endif // THRUST_DEVICE_SYSTEM
 
-// XXX make the use of THRUST_DEVICE_BACKEND an error in Thrust 1.7
-// XXX eliminate the following in Thrust 1.7
-
-#define THRUST_DEVICE_BACKEND_CUDA THRUST_DEVICE_SYSTEM_CUDA
-#define THRUST_DEVICE_BACKEND_OMP  THRUST_DEVICE_SYSTEM_OMP
-#define THRUST_DEVICE_BACKEND_TBB  THRUST_DEVICE_SYSTEM_TBB
-
 #ifdef THRUST_DEVICE_BACKEND
-#  if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
-#    pragma message("----------------------------------------------------------------------------------")
-#    pragma message("| WARNING: THRUST_DEVICE_BACKEND is deprecated; use THRUST_DEVICE_SYSTEM instead |")
-#    pragma message("----------------------------------------------------------------------------------")
-#  else
-#    warning ----------------------------------------------------------------------------------
-#    warning | WARNING: THRUST_DEVICE_BACKEND is deprecated; use THRUST_DEVICE_SYSTEM instead |
-#    warning ----------------------------------------------------------------------------------
-#  endif // THRUST_HOST_COMPILER
-#  undef THRUST_DEVICE_SYSTEM
-#  define THRUST_DEVICE_SYSTEM THRUST_DEVICE_BACKEND
+#  error THRUST_DEVICE_BACKEND is no longer supported; use THRUST_DEVICE_SYSTEM instead.
 #endif // THRUST_DEVICE_BACKEND
 
 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
diff --git a/thrust/detail/config/exec_check_disable.h b/thrust/detail/config/exec_check_disable.h
index dcadaf141..9b25b375d 100644
--- a/thrust/detail/config/exec_check_disable.h
+++ b/thrust/detail/config/exec_check_disable.h
@@ -22,9 +22,17 @@
 
 #include <thrust/detail/config.h>
 
-#if defined(__CUDACC__) && !(defined(__CUDA__) && defined(__clang__))
-
-#define __thrust_exec_check_disable__ #pragma nv_exec_check_disable
+// #pragma nv_exec_check_disable is only recognized by NVCC.  Having a macro
+// expand to a #pragma (rather than _Pragma) only works with NVCC's compilation
+// model, not with other compilers.
+#if defined(__CUDACC__) && !defined(_NVHPC_CUDA) && \
+    !(defined(__CUDA__) && defined(__clang__))
+
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+#define __thrust_exec_check_disable__ __pragma("nv_exec_check_disable")
+#else // MSVC
+#define __thrust_exec_check_disable__ _Pragma("nv_exec_check_disable")
+#endif // MSVC
 
 #else
 
diff --git a/thrust/detail/config/forceinline.h b/thrust/detail/config/forceinline.h
index 664130425..b001fd4b1 100644
--- a/thrust/detail/config/forceinline.h
+++ b/thrust/detail/config/forceinline.h
@@ -22,7 +22,7 @@
 
 #include <thrust/detail/config.h>
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
 
 #define __thrust_forceinline__ __forceinline__
 
diff --git a/thrust/detail/config/global_workarounds.h b/thrust/detail/config/global_workarounds.h
index a9015e846..9800f0359 100644
--- a/thrust/detail/config/global_workarounds.h
+++ b/thrust/detail/config/global_workarounds.h
@@ -20,7 +20,7 @@
 
 // XXX workaround gcc 4.8+'s complaints about unused local typedefs by silencing them globally
 #if defined(THRUST_GCC_VERSION) && (THRUST_GCC_VERSION >= 40800)
-#  if defined(__NVCC__) && (CUDA_VERSION >= 6000)
+#  if defined(__NVCC__) && (CUDART_VERSION >= 6000)
 #    pragma GCC diagnostic ignored "-Wunused-local-typedefs"
 #  endif // nvcc & cuda 6+
 #endif // gcc 4.8
diff --git a/thrust/detail/config/host_system.h b/thrust/detail/config/host_system.h
index 5c1387803..f216f6492 100644
--- a/thrust/detail/config/host_system.h
+++ b/thrust/detail/config/host_system.h
@@ -25,25 +25,8 @@
 #define THRUST_HOST_SYSTEM THRUST_HOST_SYSTEM_CPP
 #endif // THRUST_HOST_SYSTEM
 
-// XXX make the use of THRUST_HOST_BACKEND an error in Thrust 1.7
-// XXX eliminate the following in Thrust 1.7
-
-#define THRUST_HOST_BACKEND_CPP THRUST_HOST_SYSTEM_CPP
-#define THRUST_HOST_BACKEND_OMP THRUST_HOST_SYSTEM_OMP
-#define THRUST_HOST_BACKEND_TBB THRUST_HOST_SYSTEM_TBB
-
 #ifdef THRUST_HOST_BACKEND
-#  if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
-#    pragma message("------------------------------------------------------------------------------")
-#    pragma message("| WARNING: THRUST_HOST_BACKEND is deprecated; use THRUST_HOST_SYSTEM instead |")
-#    pragma message("------------------------------------------------------------------------------")
-#  else
-#    warning ------------------------------------------------------------------------------
-#    warning | WARNING: THRUST_HOST_BACKEND is deprecated; use THRUST_HOST_SYSTEM instead |
-#    warning ------------------------------------------------------------------------------
-#  endif // THRUST_HOST_COMPILER
-#  undef THRUST_HOST_SYSTEM
-#  define THRUST_HOST_SYSTEM THRUST_HOST_BACKEND
+#  error THRUST_HOST_BACKEND is no longer supported; use THRUST_HOST_SYSTEM instead.
 #endif // THRUST_HOST_BACKEND
 
 #if THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_CPP
diff --git a/thrust/mr/detail/config.h b/thrust/detail/config/memory_resource.h
similarity index 89%
rename from thrust/mr/detail/config.h
rename to thrust/detail/config/memory_resource.h
index 3f4795026..ab719c9bd 100644
--- a/thrust/mr/detail/config.h
+++ b/thrust/detail/config/memory_resource.h
@@ -22,9 +22,9 @@
 #include <thrust/detail/alignment.h>
 #include <thrust/detail/config/cpp_compatibility.h>
 
-#define THRUST_MR_DEFAULT_ALIGNMENT THRUST_ALIGNOF(::thrust::detail::max_align_t)
+#define THRUST_MR_DEFAULT_ALIGNMENT THRUST_ALIGNOF(THRUST_NS_QUALIFIER::detail::max_align_t)
 
-#if __cplusplus >= 201703L
+#if THRUST_CPP_DIALECT >= 2017
 #  if __has_include(<memory_resource>)
 #    define THRUST_MR_STD_MR_HEADER <memory_resource>
 #    define THRUST_MR_STD_MR_NS std::pmr
@@ -33,4 +33,3 @@
 #    define THRUST_MR_STD_MR_NS std::experimental::pmr
 #  endif
 #endif
-
diff --git a/thrust/detail/config/namespace.h b/thrust/detail/config/namespace.h
new file mode 100644
index 000000000..9c7904616
--- /dev/null
+++ b/thrust/detail/config/namespace.h
@@ -0,0 +1,120 @@
+/*
+ *  Copyright 2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+/**
+ * \file namespace.h
+ * \brief Utilities that allow `thrust::` to be placed inside an
+ * application-specific namespace.
+ */
+
+/**
+ * \def THRUST_CUB_WRAPPED_NAMESPACE
+ * If defined, this value will be used as the name of a namespace that wraps the
+ * `thrust::` and `cub::` namespaces.
+ * This macro should not be used with any other Thrust namespace macros.
+ */
+#ifdef THRUST_CUB_WRAPPED_NAMESPACE
+#define THRUST_WRAPPED_NAMESPACE THRUST_CUB_WRAPPED_NAMESPACE
+#endif
+
+/**
+ * \def THRUST_WRAPPED_NAMESPACE
+ * If defined, this value will be used as the name of a namespace that wraps the
+ * `thrust::` namespace.
+ * If THRUST_CUB_WRAPPED_NAMESPACE is set, this will inherit that macro's value.
+ * This macro should not be used with any other Thrust namespace macros.
+ */
+#ifdef THRUST_WRAPPED_NAMESPACE
+#define THRUST_NS_PREFIX                                                       \
+  namespace THRUST_WRAPPED_NAMESPACE                                           \
+  {
+
+#define THRUST_NS_POSTFIX }
+
+#define THRUST_NS_QUALIFIER ::THRUST_WRAPPED_NAMESPACE::thrust
+#endif
+
+/**
+ * \def THRUST_NS_PREFIX
+ * This macro is inserted prior to all `namespace thrust { ... }` blocks. It is
+ * derived from THRUST_WRAPPED_NAMESPACE, if set, and will be empty otherwise.
+ * It may be defined by users, in which case THRUST_NS_PREFIX,
+ * THRUST_NS_POSTFIX, and THRUST_NS_QUALIFIER must all be set consistently.
+ */
+#ifndef THRUST_NS_PREFIX
+#define THRUST_NS_PREFIX
+#endif
+
+/**
+ * \def THRUST_NS_POSTFIX
+ * This macro is inserted following the closing braces of all
+ * `namespace thrust { ... }` block. It is defined appropriately when
+ * THRUST_WRAPPED_NAMESPACE is set, and will be empty otherwise. It may be
+ * defined by users, in which case THRUST_NS_PREFIX, THRUST_NS_POSTFIX, and
+ * THRUST_NS_QUALIFIER must all be set consistently.
+ */
+#ifndef THRUST_NS_POSTFIX
+#define THRUST_NS_POSTFIX
+#endif
+
+/**
+ * \def THRUST_NS_QUALIFIER
+ * This macro is used to qualify members of thrust:: when accessing them from
+ * outside of their namespace. By default, this is just `::thrust`, and will be
+ * set appropriately when THRUST_WRAPPED_NAMESPACE is defined. This macro may be
+ * defined by users, in which case THRUST_NS_PREFIX, THRUST_NS_POSTFIX, and
+ * THRUST_NS_QUALIFIER must all be set consistently.
+ */
+#ifndef THRUST_NS_QUALIFIER
+#define THRUST_NS_QUALIFIER ::thrust
+#endif
+
+/**
+ * \def THRUST_NAMESPACE_BEGIN
+ * This macro is used to open a `thrust::` namespace block, along with any
+ * enclosing namespaces requested by THRUST_WRAPPED_NAMESPACE, etc.
+ * This macro is defined by Thrust and may not be overridden.
+ */
+#define THRUST_NAMESPACE_BEGIN                                                 \
+  THRUST_NS_PREFIX                                                             \
+  namespace thrust                                                             \
+  {
+
+/**
+ * \def THRUST_NAMESPACE_END
+ * This macro is used to close a `thrust::` namespace block, along with any
+ * enclosing namespaces requested by THRUST_WRAPPED_NAMESPACE, etc.
+ * This macro is defined by Thrust and may not be overridden.
+ */
+#define THRUST_NAMESPACE_END                                                   \
+  } /* end namespace thrust */                                                 \
+  THRUST_NS_POSTFIX
+
+// The following is just here to add docs for the thrust namespace:
+
+THRUST_NS_PREFIX
+
+/*! \namespace thrust
+ *  \brief \p thrust is the top-level namespace which contains all Thrust
+ *         functions and types.
+ */
+namespace thrust
+{
+}
+
+THRUST_NS_POSTFIX
diff --git a/thrust/detail/config/simple_defines.h b/thrust/detail/config/simple_defines.h
index 369fa6da5..e3ea2eb64 100644
--- a/thrust/detail/config/simple_defines.h
+++ b/thrust/detail/config/simple_defines.h
@@ -24,5 +24,7 @@
 #define THRUST_FALSE   0
 #define THRUST_TRUE    1
 
+#define THRUST_UNUSED_VAR(expr) do { (void)(expr); } while (0)
+
 #define THRUST_PREVENT_MACRO_SUBSTITUTION
 
diff --git a/thrust/detail/contiguous_storage.h b/thrust/detail/contiguous_storage.h
index 378cfb815..536c1c27c 100644
--- a/thrust/detail/contiguous_storage.h
+++ b/thrust/detail/contiguous_storage.h
@@ -19,9 +19,9 @@
 #include <thrust/iterator/detail/normal_iterator.h>
 #include <thrust/detail/execution_policy.h>
 #include <thrust/detail/allocator/allocator_traits.h>
+#include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -42,14 +42,8 @@ template<typename T, typename Alloc>
     typedef typename alloc_traits::const_pointer       const_pointer;
     typedef typename alloc_traits::size_type           size_type;
     typedef typename alloc_traits::difference_type     difference_type;
-
-    // XXX we should bring reference & const_reference into allocator_traits
-    //     at the moment, it's unclear how -- we have nothing analogous to
-    //     rebind_pointer for references
-    //     we either need to add reference_traits or extend the existing
-    //     pointer_traits to support wrapped references
-    typedef typename Alloc::reference                  reference;
-    typedef typename Alloc::const_reference            const_reference;
+    typedef typename alloc_traits::reference           reference;
+    typedef typename alloc_traits::const_reference     const_reference;
 
     typedef thrust::detail::normal_iterator<pointer>       iterator;
     typedef thrust::detail::normal_iterator<const_pointer> const_iterator;
@@ -167,7 +161,7 @@ template<typename T, typename Alloc>
     __host__ __device__
     void propagate_allocator(const contiguous_storage &other);
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
     __host__ __device__
     void propagate_allocator(contiguous_storage &other);
 
@@ -220,7 +214,7 @@ template<typename T, typename Alloc>
     __host__ __device__
     void propagate_allocator_dispatch(false_type, const contiguous_storage &other);
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
     __host__ __device__
     void propagate_allocator_dispatch(true_type, contiguous_storage &other);
 
@@ -235,7 +229,7 @@ template<typename T, typename Alloc>
 __host__ __device__
 void swap(detail::contiguous_storage<T,Alloc> &lhs, detail::contiguous_storage<T,Alloc> &rhs);
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/contiguous_storage.inl>
 
diff --git a/thrust/detail/contiguous_storage.inl b/thrust/detail/contiguous_storage.inl
index 2556260f2..7ae8657f0 100644
--- a/thrust/detail/contiguous_storage.inl
+++ b/thrust/detail/contiguous_storage.inl
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
 #include <thrust/detail/contiguous_storage.h>
 #include <thrust/detail/swap.h>
 #include <thrust/detail/allocator/allocator_traits.h>
@@ -23,10 +24,13 @@
 #include <thrust/detail/allocator/default_construct_range.h>
 #include <thrust/detail/allocator/destroy_range.h>
 #include <thrust/detail/allocator/fill_construct_range.h>
+
+#include <nv/target>
+
+#include <stdexcept> // for std::runtime_error
 #include <utility> // for use of std::swap in the WAR below
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -186,6 +190,7 @@ __host__ __device__
   return m_begin[n];
 } // end contiguous_storage::operator[]()
 
+__thrust_exec_check_disable__
 template<typename T, typename Alloc>
 __host__ __device__
   typename contiguous_storage<T,Alloc>::allocator_type
@@ -340,6 +345,7 @@ __host__ __device__
   destroy_on_allocator_mismatch_dispatch(c, other, first, last);
 } // end contiguous_storage::destroy_on_allocator_mismatch
 
+__thrust_exec_check_disable__
 template<typename T, typename Alloc>
 __host__ __device__
   void contiguous_storage<T,Alloc>
@@ -382,7 +388,7 @@ __host__ __device__
   propagate_allocator_dispatch(c, other);
 } // end contiguous_storage::propagate_allocator()
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
 template<typename T, typename Alloc>
 __host__ __device__
   void contiguous_storage<T,Alloc>
@@ -428,15 +434,16 @@ __host__ __device__
   void contiguous_storage<T,Alloc>
     ::swap_allocators(false_type, Alloc &other)
 {
-#ifdef __CUDA_ARCH__
-  // allocators must be equal when swapping containers with allocators that propagate on swap
-  assert(!is_allocator_not_equal(other));
-#else
-  if (is_allocator_not_equal(other))
-  {
-    throw allocator_mismatch_on_swap();
-  }
-#endif
+  NV_IF_TARGET(NV_IS_DEVICE, (
+    // allocators must be equal when swapping containers with allocators that propagate on swap
+    assert(!is_allocator_not_equal(other));
+  ), (
+    if (is_allocator_not_equal(other))
+    {
+      throw allocator_mismatch_on_swap();
+    }
+  ));
+
   thrust::swap(m_allocator, other);
 } // end contiguous_storage::swap_allocators()
 
@@ -448,6 +455,7 @@ __host__ __device__
   return false;
 } // end contiguous_storage::is_allocator_not_equal_dispatch()
 
+__thrust_exec_check_disable__
 template<typename T, typename Alloc>
 __host__ __device__
   bool contiguous_storage<T,Alloc>
@@ -456,6 +464,7 @@ __host__ __device__
   return m_allocator != other;
 } // end contiguous_storage::is_allocator_not_equal_dispatch()
 
+__thrust_exec_check_disable__
 template<typename T, typename Alloc>
 __host__ __device__
   void contiguous_storage<T,Alloc>
@@ -474,6 +483,7 @@ __host__ __device__
 {
 } // end contiguous_storage::deallocate_on_allocator_mismatch()
 
+__thrust_exec_check_disable__
 template<typename T, typename Alloc>
 __host__ __device__
   void contiguous_storage<T,Alloc>
@@ -494,6 +504,7 @@ __host__ __device__
 {
 } // end contiguous_storage::destroy_on_allocator_mismatch()
 
+__thrust_exec_check_disable__
 template<typename T, typename Alloc>
 __host__ __device__
   void contiguous_storage<T,Alloc>
@@ -509,7 +520,8 @@ __host__ __device__
 {
 } // end contiguous_storage::propagate_allocator()
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
+__thrust_exec_check_disable__
 template<typename T, typename Alloc>
 __host__ __device__
   void contiguous_storage<T,Alloc>
@@ -535,5 +547,4 @@ __host__ __device__
   lhs.swap(rhs);
 } // end swap()
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/copy.h b/thrust/detail/copy.h
index 5e9feb0f9..d6c5bc805 100644
--- a/thrust/detail/copy.h
+++ b/thrust/detail/copy.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 template<typename System,
          typename InputIterator,
@@ -85,7 +84,7 @@ __host__ __device__
 
 
 } // end detail
-} // end thrust
 
-#include <thrust/detail/copy.inl>
+THRUST_NAMESPACE_END
 
+#include <thrust/detail/copy.inl>
diff --git a/thrust/detail/copy.inl b/thrust/detail/copy.inl
index 85701fde7..4d62798c7 100644
--- a/thrust/detail/copy.inl
+++ b/thrust/detail/copy.inl
@@ -14,6 +14,7 @@
  *  limitations under the License.
  */
 
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/detail/copy.h>
@@ -21,9 +22,7 @@
 #include <thrust/system/detail/generic/copy.h>
 #include <thrust/system/detail/adl/copy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy, typename InputIterator, typename OutputIterator>
@@ -127,6 +126,4 @@ template<typename InputIterator,
   return thrust::detail::two_system_copy_n(system1, system2, first, n, result);
 } // end copy_n()
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/copy_if.h b/thrust/detail/copy_if.h
index 563623c88..32eb5e083 100644
--- a/thrust/detail/copy_if.h
+++ b/thrust/detail/copy_if.h
@@ -19,9 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 template<typename DerivedPolicy,
          typename InputIterator,
@@ -68,8 +66,6 @@ template<typename InputIterator1,
                          OutputIterator result,
                          Predicate pred);
 
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/copy_if.inl>
-
diff --git a/thrust/detail/copy_if.inl b/thrust/detail/copy_if.inl
index f4c22f8a5..952541c51 100644
--- a/thrust/detail/copy_if.inl
+++ b/thrust/detail/copy_if.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/copy_if.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -21,9 +23,7 @@
 #include <thrust/system/detail/generic/select_system.h>
 #include <thrust/system/detail/adl/copy_if.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy,
@@ -104,6 +104,4 @@ template<typename InputIterator1,
   return thrust::copy_if(select_system(system1,system2,system3), first, last, stencil, result, pred);
 } // end copy_if()
 
-
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/count.h b/thrust/detail/count.h
new file mode 100644
index 000000000..7c48bc546
--- /dev/null
+++ b/thrust/detail/count.h
@@ -0,0 +1,60 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+THRUST_NAMESPACE_BEGIN
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename EqualityComparable>
+__host__ __device__
+  typename thrust::iterator_traits<InputIterator>::difference_type
+    count(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+          InputIterator first,
+          InputIterator last,
+          const EqualityComparable& value);
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename Predicate>
+__host__ __device__
+  typename thrust::iterator_traits<InputIterator>::difference_type
+    count_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+             InputIterator first,
+             InputIterator last,
+             Predicate pred);
+
+template <typename InputIterator,
+          typename EqualityComparable>
+  typename thrust::iterator_traits<InputIterator>::difference_type
+    count(InputIterator first,
+          InputIterator last,
+          const EqualityComparable& value);
+
+template <typename InputIterator,
+          typename Predicate>
+  typename thrust::iterator_traits<InputIterator>::difference_type
+    count_if(InputIterator first,
+             InputIterator last,
+             Predicate pred);
+
+THRUST_NAMESPACE_END
+
+#include <thrust/detail/count.inl>
diff --git a/thrust/detail/count.inl b/thrust/detail/count.inl
index f7ba7a54e..5d1f628a9 100644
--- a/thrust/detail/count.inl
+++ b/thrust/detail/count.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file count.inl
- *  \brief Inline file for count.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/count.h>
@@ -26,9 +23,7 @@
 #include <thrust/system/detail/generic/count.h>
 #include <thrust/system/detail/adl/count.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy, typename InputIterator, typename EqualityComparable>
@@ -79,6 +74,4 @@ count_if(InputIterator first, InputIterator last, Predicate pred)
   return thrust::count_if(select_system(system), first, last, pred);
 } // end count_if()
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/device_vector.inl b/thrust/detail/cpp14_required.h
similarity index 56%
rename from thrust/detail/device_vector.inl
rename to thrust/detail/cpp14_required.h
index e59b5670e..083c8a1ad 100644
--- a/thrust/detail/device_vector.inl
+++ b/thrust/detail/cpp14_required.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,25 +14,13 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file device_vector.inl
- *  \brief Inline file for device_vector.h.
- */
-
-#include <thrust/host_vector.h>
-
-namespace thrust
-{
-
-template<typename T, typename Alloc>
-  template<typename OtherT, typename OtherAlloc>
-    __host__
-    device_vector<T,Alloc>
-      ::device_vector(const host_vector<OtherT,OtherAlloc> &v)
-        :Parent(v)
-{
-  ;
-} // end device_vector::device_vector()
+#include <thrust/detail/config/cpp_dialect.h>
 
-} // end namespace thrust
+#ifndef THRUST_CPP14_REQUIRED_NO_ERROR
+#  if THRUST_CPP_DIALECT < 2014
+#    error C++14 is required for this Thrust feature; please upgrade your compiler or pass the appropriate -std=c++14 flag to it.
+#  endif
+#endif
 
diff --git a/thrust/detail/cstdint.h b/thrust/detail/cstdint.h
index 248390a52..f41e11475 100644
--- a/thrust/detail/cstdint.h
+++ b/thrust/detail/cstdint.h
@@ -16,12 +16,16 @@
 
 #pragma once
 
-#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) || (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG)
+#include <thrust/detail/config.h>
+
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) || \
+    (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG) || \
+    (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_INTEL)
 #include <stdint.h>
 #endif
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
+
 namespace detail
 {
 
@@ -75,5 +79,5 @@ typedef divine_intptr_t<>::type   intptr_t;
 typedef divine_uintptr_t<>::type  uintptr_t;
 
 } // end detail
-} // end thrust
 
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/dependencies_aware_execution_policy.h b/thrust/detail/dependencies_aware_execution_policy.h
index ca6092bfd..a7567a3fa 100644
--- a/thrust/detail/dependencies_aware_execution_policy.h
+++ b/thrust/detail/dependencies_aware_execution_policy.h
@@ -25,8 +25,8 @@
 
 #include <thrust/detail/execute_with_dependencies.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
+
 namespace detail
 {
 
@@ -34,14 +34,11 @@ template<template<typename> class ExecutionPolicyCRTPBase>
 struct dependencies_aware_execution_policy
 {
     template<typename ...Dependencies>
-    using execute_with_dependencies_type = thrust::detail::execute_with_dependencies<
+    __host__
+    thrust::detail::execute_with_dependencies<
         ExecutionPolicyCRTPBase,
         Dependencies...
-    >;
-
-    template<typename ...Dependencies>
-    __host__
-    execute_with_dependencies_type<Dependencies...>
+    >
     after(Dependencies&& ...dependencies) const
     {
         return { capture_as_dependency(THRUST_FWD(dependencies))... };
@@ -49,14 +46,20 @@ struct dependencies_aware_execution_policy
 
     template<typename ...Dependencies>
     __host__
-    execute_with_dependencies_type<Dependencies...>
+    thrust::detail::execute_with_dependencies<
+        ExecutionPolicyCRTPBase,
+        Dependencies...
+    >
     after(std::tuple<Dependencies...>& dependencies) const
     {
         return { capture_as_dependency(dependencies) };
     }
     template<typename ...Dependencies>
     __host__
-    execute_with_dependencies_type<Dependencies...>
+    thrust::detail::execute_with_dependencies<
+        ExecutionPolicyCRTPBase,
+        Dependencies...
+    >
     after(std::tuple<Dependencies...>&& dependencies) const
     {
         return { capture_as_dependency(std::move(dependencies)) };
@@ -64,7 +67,10 @@ struct dependencies_aware_execution_policy
 
     template<typename ...Dependencies>
     __host__
-    execute_with_dependencies_type<Dependencies...>
+    thrust::detail::execute_with_dependencies<
+        ExecutionPolicyCRTPBase,
+        Dependencies...
+    >
     rebind_after(Dependencies&& ...dependencies) const
     {
         return { capture_as_dependency(THRUST_FWD(dependencies))... };
@@ -72,14 +78,20 @@ struct dependencies_aware_execution_policy
 
     template<typename ...Dependencies>
     __host__
-    execute_with_dependencies_type<Dependencies...>
+    thrust::detail::execute_with_dependencies<
+        ExecutionPolicyCRTPBase,
+        Dependencies...
+    >
     rebind_after(std::tuple<Dependencies...>& dependencies) const
     {
         return { capture_as_dependency(dependencies) };
     }
     template<typename ...Dependencies>
     __host__
-    execute_with_dependencies_type<Dependencies...>
+    thrust::detail::execute_with_dependencies<
+        ExecutionPolicyCRTPBase,
+        Dependencies...
+    >
     rebind_after(std::tuple<Dependencies...>&& dependencies) const
     {
         return { capture_as_dependency(std::move(dependencies)) };
@@ -87,7 +99,8 @@ struct dependencies_aware_execution_policy
 };
 
 } // end detail
-} // end thrust
+
+THRUST_NAMESPACE_END
 
 #endif // THRUST_CPP_DIALECT >= 2011
 
diff --git a/thrust/detail/device_delete.inl b/thrust/detail/device_delete.inl
index f1a67f91b..87f73aad9 100644
--- a/thrust/detail/device_delete.inl
+++ b/thrust/detail/device_delete.inl
@@ -14,17 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file device_delete.inl
- *  \brief Inline file for device_delete.h.
- */
-
+#include <thrust/detail/config.h>
 #include <thrust/device_delete.h>
 #include <thrust/device_free.h>
 #include <thrust/detail/allocator/destroy_range.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
+
 namespace detail
 {
 
@@ -43,5 +41,4 @@ template<typename T>
   thrust::device_free(ptr);
 } // end device_delete()
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/device_free.inl b/thrust/detail/device_free.inl
index 7a1b6c123..806802e16 100644
--- a/thrust/detail/device_free.inl
+++ b/thrust/detail/device_free.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file device_free.inl
- *  \brief Inline file for device_free.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/device_free.h>
@@ -25,8 +22,7 @@
 #include <thrust/system/detail/generic/select_system.h>
 #include <thrust/detail/malloc_and_free.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 void device_free(thrust::device_ptr<void> ptr)
 {
@@ -40,5 +36,4 @@ void device_free(thrust::device_ptr<void> ptr)
   thrust::free(s, ptr);
 } // end device_free()
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/device_malloc.inl b/thrust/detail/device_malloc.inl
index 938c3c807..f4222f51d 100644
--- a/thrust/detail/device_malloc.inl
+++ b/thrust/detail/device_malloc.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file device_malloc.inl
- *  \brief Inline file for device_malloc.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/device_malloc.h>
@@ -25,9 +22,7 @@
 #include <thrust/system/detail/generic/select_system.h>
 #include <thrust/detail/malloc_and_free.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 thrust::device_ptr<void> device_malloc(const std::size_t n)
 {
@@ -55,6 +50,4 @@ template<typename T>
   return thrust::device_ptr<T>(thrust::malloc<T>(s,n).get());
 } // end device_malloc()
 
-
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/device_new.inl b/thrust/detail/device_new.inl
index 2551badb4..c66e2cbff 100644
--- a/thrust/detail/device_new.inl
+++ b/thrust/detail/device_new.inl
@@ -14,17 +14,14 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file device_new.inl
- *  \brief Inline file for device_new.h.
- */
-
+#include <thrust/detail/config.h>
 #include <thrust/device_new.h>
 #include <thrust/device_malloc.h>
 #include <thrust/uninitialized_fill.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 template<typename T>
   device_ptr<T> device_new(device_ptr<void> p,
@@ -45,7 +42,7 @@ template<typename T>
 
   // run copy constructors at p here
   thrust::uninitialized_fill(result, result + n, exemplar);
-  
+
   return result;
 } // end device_new()
 
@@ -56,5 +53,4 @@ template<typename T>
   return device_new<T>(thrust::device_malloc<T>(n));
 } // end device_new()
 
-} // thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/device_ptr.inl b/thrust/detail/device_ptr.inl
index d1058ca6a..361c61f33 100644
--- a/thrust/detail/device_ptr.inl
+++ b/thrust/detail/device_ptr.inl
@@ -14,18 +14,15 @@
  *  limitations under the License.
  */
 
-
-/*! \file device_ptr.inl
- *  \brief Inline file for device_ptr.h.
- */
+#pragma once
 
 #include <thrust/device_ptr.h>
 #include <thrust/device_reference.h>
+#include <thrust/detail/config.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 template<typename T>
   __host__ __device__
@@ -63,5 +60,5 @@ template<typename T>
 
 
 } // end namespace detail
-} // end namespace thrust
 
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/device_reference.inl b/thrust/detail/device_reference.inl
deleted file mode 100644
index 07f6af726..000000000
--- a/thrust/detail/device_reference.inl
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
-/*! \file device_reference.inl
- *  \brief Inline file for device_reference.h.
- */
-
-#include <thrust/detail/config.h>
-#include <thrust/device_reference.h>
-
-namespace thrust
-{
-
-template<typename T>
-  template<typename OtherT>
-    __host__ __device__
-    device_reference<T> &
-      device_reference<T>
-        ::operator=(const device_reference<OtherT> &other)
-{
-  return super_t::operator=(other);
-} // end operator=()
-
-template<typename T>
-  __host__ __device__
-  device_reference<T> &
-    device_reference<T>
-      ::operator=(const value_type &x)
-{
-  return super_t::operator=(x);
-} // end operator=()
-
-template<typename T>
-__host__ __device__
-void swap(device_reference<T> a, device_reference<T> b)
-{
-  a.swap(b);
-} // end swap()
-
-} // end thrust
-
diff --git a/thrust/detail/distance.inl b/thrust/detail/distance.inl
index 5732a9c25..6702c2b6f 100644
--- a/thrust/detail/distance.inl
+++ b/thrust/detail/distance.inl
@@ -14,19 +14,16 @@
  *  limitations under the License.
  */
 
-
-/*! \file distance.inl
- *  \brief Inline file for distance.h
- */
+#pragma once
 
 #include <thrust/advance.h>
+#include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/distance.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
+__thrust_exec_check_disable__
 template<typename InputIterator>
 inline __host__ __device__
   typename thrust::iterator_traits<InputIterator>::difference_type
@@ -35,6 +32,4 @@ inline __host__ __device__
   return thrust::system::detail::generic::distance(first, last);
 } // end distance()
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/equal.inl b/thrust/detail/equal.inl
index 08bfbab0b..e21ddfa5a 100644
--- a/thrust/detail/equal.inl
+++ b/thrust/detail/equal.inl
@@ -14,20 +14,16 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file equal.inl
- *  \brief Inline file for equal.h.
- */
-
+#include <thrust/detail/config.h>
 #include <thrust/equal.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/system/detail/generic/select_system.h>
 #include <thrust/system/detail/generic/equal.h>
 #include <thrust/system/detail/adl/equal.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename System, typename InputIterator1, typename InputIterator2>
@@ -65,7 +61,7 @@ bool equal(InputIterator1 first1, InputIterator1 last1,
 }
 
 
-template <typename InputIterator1, typename InputIterator2, 
+template <typename InputIterator1, typename InputIterator2,
           typename BinaryPredicate>
 bool equal(InputIterator1 first1, InputIterator1 last1,
            InputIterator2 first2, BinaryPredicate binary_pred)
@@ -81,6 +77,4 @@ bool equal(InputIterator1 first1, InputIterator1 last1,
   return thrust::equal(select_system(system1,system2), first1, last1, first2, binary_pred);
 }
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/event_error.h b/thrust/detail/event_error.h
index 9f576a12a..b928e0650 100644
--- a/thrust/detail/event_error.h
+++ b/thrust/detail/event_error.h
@@ -20,17 +20,16 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/cpp11_required.h>
-#include <thrust/detail/modern_gcc_required.h>
+#include <thrust/detail/cpp14_required.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <thrust/detail/type_traits.h>
 #include <thrust/system/error_code.h>
 
 #include <stdexcept>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 enum class event_errc
 {
@@ -64,7 +63,7 @@ struct event_error_category : error_category
         return "no_state: an operation that requires an event or future to have "
                "a stream or content has been performed on a event or future "
                "without either, e.g. a moved-from or default constructed event "
-               "or future (anevent or future may have been consumed more than "
+               "or future (an event or future may have been consumed more than "
                "once)";
       }
       case event_errc::no_content:
@@ -93,21 +92,24 @@ struct event_error_category : error_category
 
     return system_category().default_error_condition(ev);
   }
-}; 
+};
 
 /// Obtains a reference to the static error category object for the errors
 /// related to futures and promises. The object is required to override the
-/// virtual function error_category::name() to return a pointer to the string 
-/// "event". It is used to identify error codes provided in the 
-/// exceptions of type event_error. 
+/// virtual function error_category::name() to return a pointer to the string
+/// "event". It is used to identify error codes provided in the
+/// exceptions of type event_error.
 inline error_category const& event_category()
 {
   static const event_error_category result;
   return result;
 }
 
+namespace system
+{
 /// Specialization of \p is_error_code_enum for \p event_errc.
 template<> struct is_error_code_enum<event_errc> : true_type {};
+} // end system
 
 /// \return <tt>error_code(static_cast<int>(e), event_category())</tt>
 inline error_code make_error_code(event_errc e)
@@ -119,7 +121,7 @@ inline error_code make_error_code(event_errc e)
 inline error_condition make_error_condition(event_errc e)
 {
   return error_condition(static_cast<int>(e), event_category());
-} 
+}
 
 struct event_error : std::logic_error
 {
@@ -156,7 +158,7 @@ inline bool operator<(event_error const& lhs, event_error const& rhs) noexcept
   return lhs.code() < rhs.code();
 }
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
-#endif
+#endif // C++14
 
diff --git a/thrust/detail/execute_with_allocator.h b/thrust/detail/execute_with_allocator.h
index 0b92d12b3..430fe739c 100644
--- a/thrust/detail/execute_with_allocator.h
+++ b/thrust/detail/execute_with_allocator.h
@@ -25,8 +25,8 @@
 #include <thrust/detail/allocator/allocator_traits.h>
 #include <thrust/detail/integer_math.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
+
 namespace detail
 {
 
@@ -68,17 +68,23 @@ void
 return_temporary_buffer(
     thrust::detail::execute_with_allocator<Allocator, BaseSystem>& system
   , Pointer p
+  , std::ptrdiff_t n
     )
 {
   typedef typename thrust::detail::remove_reference<Allocator>::type naked_allocator;
   typedef typename thrust::detail::allocator_traits<naked_allocator> alloc_traits;
   typedef typename alloc_traits::pointer                             pointer;
+  typedef typename alloc_traits::size_type                           size_type;
+  typedef typename alloc_traits::value_type                          value_type;
+  typedef typename thrust::detail::pointer_traits<Pointer>::element_type T;
+
+  size_type num_elements = divide_ri(sizeof(T) * n, sizeof(value_type));
 
   pointer to_ptr = thrust::reinterpret_pointer_cast<pointer>(p);
-  alloc_traits::deallocate(system.get_allocator(), to_ptr, 0);
+  alloc_traits::deallocate(system.get_allocator(), to_ptr, num_elements);
 }
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
 
 template <
     typename T,
@@ -119,18 +125,25 @@ __host__
 void
 return_temporary_buffer(
     thrust::detail::execute_with_allocator_and_dependencies<Allocator, BaseSystem, Dependencies...>& system,
-    Pointer p
+    Pointer p,
+    std::ptrdiff_t n
     )
 {
   typedef typename thrust::detail::remove_reference<Allocator>::type naked_allocator;
   typedef typename thrust::detail::allocator_traits<naked_allocator> alloc_traits;
   typedef typename alloc_traits::pointer                             pointer;
+  typedef typename alloc_traits::size_type                           size_type;
+  typedef typename alloc_traits::value_type                          value_type;
+  typedef typename thrust::detail::pointer_traits<Pointer>::element_type T;
+
+  size_type num_elements = divide_ri(sizeof(T) * n, sizeof(value_type));
 
   pointer to_ptr = thrust::reinterpret_pointer_cast<pointer>(p);
-  alloc_traits::deallocate(system.get_allocator(), to_ptr, 0);
+  alloc_traits::deallocate(system.get_allocator(), to_ptr, num_elements);
 }
 
 #endif
 
-}} // namespace thrust::detail
+} // namespace detail
 
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/execute_with_allocator_fwd.h b/thrust/detail/execute_with_allocator_fwd.h
index 22d78fdd6..1d5899a7d 100644
--- a/thrust/detail/execute_with_allocator_fwd.h
+++ b/thrust/detail/execute_with_allocator_fwd.h
@@ -24,8 +24,8 @@
   #include <thrust/detail/execute_with_dependencies.h>
 #endif
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
+
 namespace detail
 {
 
@@ -101,4 +101,6 @@ struct execute_with_allocator
 #endif
 };
 
-}} // namespace thrust::detail
+} // namespace detail
+
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/execute_with_dependencies.h b/thrust/detail/execute_with_dependencies.h
index 01fb82364..ec54010b0 100644
--- a/thrust/detail/execute_with_dependencies.h
+++ b/thrust/detail/execute_with_dependencies.h
@@ -27,8 +27,8 @@
 #include <tuple>
 #include <type_traits>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
+
 namespace detail
 {
 
@@ -189,8 +189,8 @@ struct execute_with_allocator_and_dependencies
         return std::move(dependencies);
     }
 
-    typename std::remove_reference<Allocator>::type&
     __host__
+    typename std::add_lvalue_reference<Allocator>::type
     get_allocator()
     {
         return alloc;
@@ -261,7 +261,7 @@ extract_dependencies(System &&)
 }
 
 } // end detail
-} // end thrust
 
-#endif // THRUST_CPP_DIALECT >= 2011
+THRUST_NAMESPACE_END
 
+#endif // THRUST_CPP_DIALECT >= 2011
diff --git a/thrust/detail/execution_policy.h b/thrust/detail/execution_policy.h
index ec554b689..dcc11a770 100644
--- a/thrust/detail/execution_policy.h
+++ b/thrust/detail/execution_policy.h
@@ -18,8 +18,8 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
+
 namespace detail
 {
 
@@ -44,7 +44,7 @@ struct execution_policy_base : execution_policy_marker {};
 
 
 template<typename DerivedPolicy>
-THRUST_CONSTEXPR __host__ __device__
+constexpr __host__ __device__
 execution_policy_base<DerivedPolicy> &strip_const(const execution_policy_base<DerivedPolicy> &x)
 {
   return const_cast<execution_policy_base<DerivedPolicy>&>(x);
@@ -52,7 +52,7 @@ execution_policy_base<DerivedPolicy> &strip_const(const execution_policy_base<De
 
 
 template<typename DerivedPolicy>
-THRUST_CONSTEXPR __host__ __device__
+constexpr __host__ __device__
 DerivedPolicy &derived_cast(execution_policy_base<DerivedPolicy> &x)
 {
   return static_cast<DerivedPolicy&>(x);
@@ -60,7 +60,7 @@ DerivedPolicy &derived_cast(execution_policy_base<DerivedPolicy> &x)
 
 
 template<typename DerivedPolicy>
-THRUST_CONSTEXPR __host__ __device__
+constexpr __host__ __device__
 const DerivedPolicy &derived_cast(const execution_policy_base<DerivedPolicy> &x)
 {
   return static_cast<const DerivedPolicy&>(x);
@@ -73,5 +73,4 @@ template<typename DerivedPolicy>
     : thrust::detail::execution_policy_base<DerivedPolicy>
 {};
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/extrema.inl b/thrust/detail/extrema.inl
index 3f60743e6..2c1750e7d 100644
--- a/thrust/detail/extrema.inl
+++ b/thrust/detail/extrema.inl
@@ -14,6 +14,7 @@
  *  limitations under the License.
  */
 
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/extrema.h>
@@ -22,9 +23,7 @@
 #include <thrust/system/detail/generic/extrema.h>
 #include <thrust/system/detail/adl/extrema.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy, typename ForwardIterator>
@@ -141,7 +140,7 @@ ForwardIterator max_element(ForwardIterator first, ForwardIterator last,
 
 
 template <typename ForwardIterator>
-thrust::pair<ForwardIterator,ForwardIterator> 
+thrust::pair<ForwardIterator,ForwardIterator>
 minmax_element(ForwardIterator first, ForwardIterator last)
 {
   using thrust::system::detail::generic::select_system;
@@ -155,7 +154,7 @@ minmax_element(ForwardIterator first, ForwardIterator last)
 
 
 template <typename ForwardIterator, typename BinaryPredicate>
-thrust::pair<ForwardIterator,ForwardIterator> 
+thrust::pair<ForwardIterator,ForwardIterator>
 minmax_element(ForwardIterator first, ForwardIterator last, BinaryPredicate comp)
 {
   using thrust::system::detail::generic::select_system;
@@ -167,6 +166,4 @@ minmax_element(ForwardIterator first, ForwardIterator last, BinaryPredicate comp
   return thrust::minmax_element(select_system(system), first, last, comp);
 } // end minmax_element()
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/fill.inl b/thrust/detail/fill.inl
index 6e957ca1f..e68672bbe 100644
--- a/thrust/detail/fill.inl
+++ b/thrust/detail/fill.inl
@@ -14,10 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file fill.inl
- *  \brief Inline file for fill.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/fill.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,9 +24,7 @@
 #include <thrust/system/detail/generic/fill.h>
 #include <thrust/system/detail/adl/fill.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy, typename ForwardIterator, typename T>
@@ -86,6 +83,4 @@ __host__ __device__
   return thrust::fill_n(select_system(system), first, n, value);
 } // end fill()
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/find.inl b/thrust/detail/find.inl
index f42ff4650..5b494f61a 100644
--- a/thrust/detail/find.inl
+++ b/thrust/detail/find.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file find.inl
- *  \brief Inline file for find.h
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,9 +22,7 @@
 #include <thrust/system/detail/generic/find.h>
 #include <thrust/system/detail/adl/find.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy, typename InputIterator, typename T>
@@ -74,11 +69,11 @@ InputIterator find(InputIterator first,
                    const T& value)
 {
   using thrust::system::detail::generic::select_system;
-  
+
   typedef typename thrust::iterator_system<InputIterator>::type System;
-  
+
   System system;
-  
+
   return thrust::find(select_system(system), first, last, value);
 }
 
@@ -88,11 +83,11 @@ InputIterator find_if(InputIterator first,
                       Predicate pred)
 {
   using thrust::system::detail::generic::select_system;
-  
+
   typedef typename thrust::iterator_system<InputIterator>::type System;
-  
+
   System system;
-  
+
   return thrust::find_if(select_system(system), first, last, pred);
 }
 
@@ -102,14 +97,12 @@ InputIterator find_if_not(InputIterator first,
                           Predicate pred)
 {
   using thrust::system::detail::generic::select_system;
-  
+
   typedef typename thrust::iterator_system<InputIterator>::type System;
-  
+
   System system;
-  
+
   return thrust::find_if_not(select_system(system), first, last, pred);
 }
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/for_each.inl b/thrust/detail/for_each.inl
index 3365ce2e0..4ba39c71a 100644
--- a/thrust/detail/for_each.inl
+++ b/thrust/detail/for_each.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file for_each.inl
- *  \brief Inline file for for_each.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/for_each.h>
@@ -26,10 +23,9 @@
 #include <thrust/system/detail/generic/for_each.h>
 #include <thrust/system/detail/adl/for_each.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template<typename DerivedPolicy,
          typename InputIterator,
          typename UnaryFunction>
@@ -58,7 +54,7 @@ InputIterator for_each(InputIterator first,
   return thrust::for_each(select_system(system), first, last, f);
 } // end for_each()
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template<typename DerivedPolicy, typename InputIterator, typename Size, typename UnaryFunction>
 __host__ __device__
   InputIterator for_each_n(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
@@ -87,6 +83,4 @@ InputIterator for_each_n(InputIterator first,
   return thrust::for_each_n(select_system(system), first, n, f);
 } // end for_each_n()
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/function.h b/thrust/detail/function.h
index f1f9e9c94..ba20507a5 100644
--- a/thrust/detail/function.h
+++ b/thrust/detail/function.h
@@ -19,85 +19,143 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/raw_reference_cast.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
+
 namespace detail
 {
 
-
-template<typename Function, typename Result>
-  struct wrapped_function
+template <typename Function, typename Result>
+struct wrapped_function
 {
   // mutable because Function::operator() might be const
   mutable Function m_f;
 
   inline __host__ __device__
   wrapped_function()
-    : m_f()
+      : m_f()
   {}
 
   inline __host__ __device__
-  wrapped_function(const Function &f)
-    : m_f(f)
+  wrapped_function(const Function& f)
+      : m_f(f)
   {}
 
   __thrust_exec_check_disable__
-  template<typename Argument>
-  inline __host__ __device__
-    Result operator()(Argument &x) const
+  template <typename Argument>
+  __thrust_forceinline__ __host__ __device__
+  Result operator()(Argument& x) const
   {
-    // we static cast to Result to handle void Result without error
-    // in case Function's result is non-void
     return static_cast<Result>(m_f(thrust::raw_reference_cast(x)));
   }
 
   __thrust_exec_check_disable__
-  template<typename Argument>
-    inline __host__ __device__ Result operator()(const Argument &x) const
+  template <typename Argument>
+  __thrust_forceinline__ __host__ __device__
+  Result operator()(const Argument& x) const
   {
-    // we static cast to Result to handle void Result without error
-    // in case Function's result is non-void
     return static_cast<Result>(m_f(thrust::raw_reference_cast(x)));
   }
 
   __thrust_exec_check_disable__
-  template<typename Argument1, typename Argument2>
-    inline __host__ __device__ Result operator()(Argument1 &x, Argument2 &y) const
+  template <typename Argument1, typename Argument2>
+  __thrust_forceinline__ __host__ __device__
+  Result operator()(Argument1& x, Argument2& y) const
   {
-    // we static cast to Result to handle void Result without error
-    // in case Function's result is non-void
-    return static_cast<Result>(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y)));
+    return static_cast<Result>(m_f(thrust::raw_reference_cast(x),
+                                   thrust::raw_reference_cast(y)));
   }
 
   __thrust_exec_check_disable__
-  template<typename Argument1, typename Argument2>
-    inline __host__ __device__ Result operator()(const Argument1 &x, Argument2 &y) const
+  template <typename Argument1, typename Argument2>
+  __thrust_forceinline__ __host__ __device__
+  Result operator()(const Argument1& x, Argument2& y) const
   {
-    // we static cast to Result to handle void Result without error
-    // in case Function's result is non-void
-    return static_cast<Result>(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y)));
+    return static_cast<Result>(m_f(thrust::raw_reference_cast(x),
+                                   thrust::raw_reference_cast(y)));
   }
 
   __thrust_exec_check_disable__
-  template<typename Argument1, typename Argument2>
-    inline __host__ __device__ Result operator()(const Argument1 &x, const Argument2 &y) const
+  template <typename Argument1, typename Argument2>
+  __thrust_forceinline__ __host__ __device__
+  Result operator()(const Argument1& x, const Argument2& y) const
   {
-    // we static cast to Result to handle void Result without error
-    // in case Function's result is non-void
-    return static_cast<Result>(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y)));
+    return static_cast<Result>(m_f(thrust::raw_reference_cast(x),
+                                   thrust::raw_reference_cast(y)));
   }
 
   __thrust_exec_check_disable__
-  template<typename Argument1, typename Argument2>
-    inline __host__ __device__ Result operator()(Argument1 &x, const Argument2 &y) const
+  template <typename Argument1, typename Argument2>
+  __thrust_forceinline__ __host__ __device__
+  Result operator()(Argument1& x, const Argument2& y) const
   {
-    // we static cast to Result to handle void Result without error
-    // in case Function's result is non-void
-    return static_cast<Result>(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y)));
+    return static_cast<Result>(m_f(thrust::raw_reference_cast(x),
+                                   thrust::raw_reference_cast(y)));
   }
 }; // end wrapped_function
 
+// Specialize for void return types:
+template <typename Function>
+struct wrapped_function<Function, void>
+{
+  // mutable because Function::operator() might be const
+  mutable Function m_f;
+  inline __host__ __device__
+  wrapped_function()
+    : m_f()
+  {}
+
+  inline __host__ __device__
+  wrapped_function(const Function& f)
+    : m_f(f)
+  {}
+
+  __thrust_exec_check_disable__
+  template <typename Argument>
+  __thrust_forceinline__ __host__ __device__
+  void operator()(Argument& x) const
+  {
+    m_f(thrust::raw_reference_cast(x));
+  }
+
+  __thrust_exec_check_disable__
+  template <typename Argument>
+  __thrust_forceinline__ __host__ __device__
+  void operator()(const Argument& x) const
+  {
+    m_f(thrust::raw_reference_cast(x));
+  }
+
+  __thrust_exec_check_disable__
+  template <typename Argument1, typename Argument2>
+  __thrust_forceinline__ __host__ __device__
+  void operator()(Argument1& x, Argument2& y) const
+  {
+    m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y));
+  }
+
+  __thrust_exec_check_disable__
+  template <typename Argument1, typename Argument2>
+  __thrust_forceinline__ __host__ __device__
+  void operator()(const Argument1& x, Argument2& y) const
+  {
+    m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y));
+  }
+  __thrust_exec_check_disable__
+  template <typename Argument1, typename Argument2>
+  __thrust_forceinline__ __host__ __device__
+  void operator()(const Argument1& x, const Argument2& y) const
+  {
+    m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y));
+  }
+  __thrust_exec_check_disable__
+  template <typename Argument1, typename Argument2>
+  __thrust_forceinline__ __host__ __device__
+  void operator()(Argument1& x, const Argument2& y) const
+  {
+    m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y));
+  }
+}; // end wrapped_function
 
-} // end detail
-} // end thrust
+} // namespace detail
 
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/functional.inl b/thrust/detail/functional.inl
index ea1322797..bdf8e0415 100644
--- a/thrust/detail/functional.inl
+++ b/thrust/detail/functional.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,10 +14,13 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -120,5 +123,4 @@ template<typename BinaryPredicate>
   return binary_negate<BinaryPredicate>(pred);
 } // end not2()
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/functional/actor.h b/thrust/detail/functional/actor.h
index 666de09ee..cee0770a4 100644
--- a/thrust/detail/functional/actor.h
+++ b/thrust/detail/functional/actor.h
@@ -30,15 +30,23 @@
 #include <thrust/detail/functional/value.h>
 #include <thrust/detail/functional/composite.h>
 #include <thrust/detail/functional/operators/assignment_operator.h>
+#include <thrust/detail/raw_reference_cast.h>
 #include <thrust/detail/type_traits/result_of_adaptable_function.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
 {
 
+// eval_ref<T> is
+// - T when T is a subclass of thrust::reference
+// - T& otherwise
+// This is used to let thrust::references pass through actor evaluations.
+template <typename T>
+using eval_ref = typename std::conditional<
+  thrust::detail::is_wrapped_reference<T>::value, T, T&>::type;
+
 template<typename Action, typename Env>
   struct apply_actor
 {
@@ -52,7 +60,7 @@ template<typename Eval>
   typedef Eval eval_type;
 
   __host__ __device__
-  actor(void);
+  constexpr actor();
 
   __host__ __device__
   actor(const Eval &base);
@@ -61,55 +69,10 @@ template<typename Eval>
   typename apply_actor<eval_type, thrust::null_type >::type
   operator()(void) const;
 
-  template<typename T0>
-  __host__ __device__
-  typename apply_actor<eval_type, thrust::tuple<T0&> >::type
-  operator()(T0 &_0) const;
-
-  template<typename T0, typename T1>
-  __host__ __device__
-  typename apply_actor<eval_type, thrust::tuple<T0&,T1&> >::type
-  operator()(T0 &_0, T1 &_1) const;
-
-  template<typename T0, typename T1, typename T2>
-  __host__ __device__
-  typename apply_actor<eval_type, thrust::tuple<T0&,T1&,T2&> >::type
-  operator()(T0 &_0, T1 &_1, T2 &_2) const;
-
-  template<typename T0, typename T1, typename T2, typename T3>
-  __host__ __device__
-  typename apply_actor<eval_type, thrust::tuple<T0&,T1&,T2&,T3&> >::type
-  operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3) const;
-
-  template<typename T0, typename T1, typename T2, typename T3, typename T4>
-  __host__ __device__
-  typename apply_actor<eval_type, thrust::tuple<T0&,T1&,T2&,T3&,T4&> >::type
-  operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4) const;
-
-  template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
-  __host__ __device__
-  typename apply_actor<eval_type, thrust::tuple<T0&,T1&,T2&,T3&,T4&,T5&> >::type
-  operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5) const;
-
-  template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
-  __host__ __device__
-  typename apply_actor<eval_type, thrust::tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&> >::type
-  operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6) const;
-
-  template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
-  __host__ __device__
-  typename apply_actor<eval_type, thrust::tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&,T7&> >::type
-  operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7) const;
-
-  template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
-  __host__ __device__
-  typename apply_actor<eval_type, thrust::tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&,T7&,T8&> >::type
-  operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7, T8 &_8) const;
-
-  template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9>
+  template <typename... Ts>
   __host__ __device__
-  typename apply_actor<eval_type, thrust::tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&,T7&,T8&,T9&> >::type
-  operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7, T8 &_8, T9 &_9) const;
+  typename apply_actor<eval_type, thrust::tuple<eval_ref<Ts>...>>::type
+  operator()(Ts&&... ts) const;
 
   template<typename T>
   __host__ __device__
@@ -186,7 +149,7 @@ template<typename Eval, typename Arg1, typename Arg2>
 }; // end result_of
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/functional/actor.inl>
 
diff --git a/thrust/detail/functional/actor.inl b/thrust/detail/functional/actor.inl
index e09dd4800..e0bdebbbf 100644
--- a/thrust/detail/functional/actor.inl
+++ b/thrust/detail/functional/actor.inl
@@ -23,13 +23,17 @@
 // Based on Boost.Phoenix v1.2
 // Copyright (c) 2001-2002 Joel de Guzman
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/functional/composite.h>
 #include <thrust/detail/functional/operators/assignment_operator.h>
 #include <thrust/functional.h>
+#include <thrust/type_traits/logical_metafunctions.h>
 
-namespace thrust
-{
+#include <type_traits>
+
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -38,8 +42,8 @@ namespace functional
 
 template<typename Eval>
   __host__ __device__
-  actor<Eval>
-    ::actor(void)
+  constexpr actor<Eval>
+    ::actor()
       : eval_type()
 {}
 
@@ -62,135 +66,38 @@ template<typename Eval>
   return eval_type::eval(thrust::null_type());
 } // end basic_environment::operator()
 
-template<typename Eval>
-  template<typename T0>
-    __host__ __device__
-    typename apply_actor<
-      typename actor<Eval>::eval_type,
-      typename thrust::tuple<T0&>
-    >::type
-      actor<Eval>
-        ::operator()(T0 &_0) const
-{
-  return eval_type::eval(thrust::tie(_0));
-} // end basic_environment::operator()
-
-template<typename Eval>
-  template<typename T0, typename T1>
-    __host__ __device__
-    typename apply_actor<
-      typename actor<Eval>::eval_type,
-      typename thrust::tuple<T0&,T1&>
-    >::type
-      actor<Eval>
-        ::operator()(T0 &_0, T1 &_1) const
-{
-  return eval_type::eval(thrust::tie(_0,_1));
-} // end basic_environment::operator()
-
-template<typename Eval>
-  template<typename T0, typename T1, typename T2>
-    __host__ __device__
-    typename apply_actor<
-      typename actor<Eval>::eval_type,
-      typename thrust::tuple<T0&,T1&,T2&>
-    >::type
-      actor<Eval>
-        ::operator()(T0 &_0, T1 &_1, T2 &_2) const
-{
-  return eval_type::eval(thrust::tie(_0,_1,_2));
-} // end basic_environment::operator()
-
-template<typename Eval>
-  template<typename T0, typename T1, typename T2, typename T3>
-    __host__ __device__
-    typename apply_actor<
-      typename actor<Eval>::eval_type,
-      typename thrust::tuple<T0&,T1&,T2&,T3&>
-    >::type
-      actor<Eval>
-        ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3) const
-{
-  return eval_type::eval(thrust::tie(_0,_1,_2,_3));
-} // end basic_environment::operator()
-
-template<typename Eval>
-  template<typename T0, typename T1, typename T2, typename T3, typename T4>
-    __host__ __device__
-    typename apply_actor<
-      typename actor<Eval>::eval_type,
-      typename thrust::tuple<T0&,T1&,T2&,T3&,T4&>
-    >::type
-      actor<Eval>
-        ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4) const
-{
-  return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4));
-} // end basic_environment::operator()
-
-template<typename Eval>
-  template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
-    __host__ __device__
-    typename apply_actor<
-      typename actor<Eval>::eval_type,
-      typename thrust::tuple<T0&,T1&,T2&,T3&,T4&,T5&>
-    >::type
-      actor<Eval>
-        ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5) const
-{
-  return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4,_5));
-} // end basic_environment::operator()
-
-template<typename Eval>
-  template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
-    __host__ __device__
-    typename apply_actor<
-      typename actor<Eval>::eval_type,
-      typename thrust::tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&>
-    >::type
-      actor<Eval>
-        ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6) const
-{
-  return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4,_5,_6));
-} // end basic_environment::operator()
-
-template<typename Eval>
-  template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
-    __host__ __device__
-    typename apply_actor<
-      typename actor<Eval>::eval_type,
-      typename thrust::tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&,T7&>
-    >::type
-      actor<Eval>
-        ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7) const
-{
-  return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4,_5,_6,_7));
-} // end basic_environment::operator()
+// actor::operator() needs to construct a tuple of references to its
+// arguments. To make this work with thrust::reference<T>, we need to
+// detect thrust proxy references and store them as T rather than T&.
+// This check ensures that the forwarding references passed into
+// actor::operator() are either:
+// - T&& if and only if T is a thrust::reference<U>, or
+// - T& for any other types.
+// This struct provides a nicer diagnostic for when these conditions aren't
+// met.
+template <typename T>
+using actor_check_ref_type =
+  thrust::detail::integral_constant<bool,
+    ( std::is_lvalue_reference<T>::value ||
+      thrust::detail::is_wrapped_reference<T>::value )>;
+
+template <typename... Ts>
+using actor_check_ref_types =
+  thrust::conjunction<actor_check_ref_type<Ts>...>;
 
 template<typename Eval>
-  template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
-    __host__ __device__
-    typename apply_actor<
-      typename actor<Eval>::eval_type,
-      typename thrust::tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&,T7&,T8&>
-    >::type
-      actor<Eval>
-        ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7, T8 &_8) const
+template<typename... Ts>
+__host__ __device__
+typename apply_actor<typename actor<Eval>::eval_type,
+                     thrust::tuple<eval_ref<Ts>...>>::type
+actor<Eval>::operator()(Ts&&... ts) const
 {
-  return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4,_5,_6,_7,_8));
-} // end basic_environment::operator()
-
-template<typename Eval>
-  template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9>
-    __host__ __device__
-    typename apply_actor<
-      typename actor<Eval>::eval_type,
-      typename thrust::tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&,T7&,T8&,T9&>
-    >::type
-      actor<Eval>
-        ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7, T8 &_8, T9 &_9) const
-{
-  return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9));
-} // end basic_environment::operator()
+  static_assert(actor_check_ref_types<Ts...>::value,
+                "Actor evaluations only support rvalue references to "
+                "thrust::reference subclasses.");
+  using tuple_type = thrust::tuple<eval_ref<Ts>...>;
+  return eval_type::eval(tuple_type(THRUST_FWD(ts)...));
+} // end actor<Eval>::operator()
 
 template<typename Eval>
   template<typename T>
@@ -204,4 +111,4 @@ template<typename Eval>
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/functional/argument.h b/thrust/detail/functional/argument.h
index 88b48a6d2..aac29f537 100644
--- a/thrust/detail/functional/argument.h
+++ b/thrust/detail/functional/argument.h
@@ -28,8 +28,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/tuple.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
@@ -59,7 +58,7 @@ template<unsigned int i>
     };
 
     __host__ __device__
-    argument(void){}
+    constexpr argument(){}
 
     template<typename Env>
     __host__ __device__
@@ -71,5 +70,5 @@ template<unsigned int i>
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/functional/composite.h b/thrust/detail/functional/composite.h
index 6cf095bf1..41ee74739 100644
--- a/thrust/detail/functional/composite.h
+++ b/thrust/detail/functional/composite.h
@@ -25,11 +25,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/functional/actor.h>
 #include <thrust/tuple.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
@@ -159,5 +160,5 @@ __host__ __device__
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/functional/operators/arithmetic_operators.h b/thrust/detail/functional/operators/arithmetic_operators.h
index 6628917d6..443d307cb 100644
--- a/thrust/detail/functional/operators/arithmetic_operators.h
+++ b/thrust/detail/functional/operators/arithmetic_operators.h
@@ -22,8 +22,7 @@
 #include <thrust/detail/functional/operators/operator_adaptors.h>
 #include <thrust/functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
@@ -33,49 +32,57 @@ template<typename Eval>
 __host__ __device__
 actor<
   composite<
-    unary_operator<thrust::negate>,
+    transparent_unary_operator<thrust::negate<>>,
     actor<Eval>
   >
 >
 __host__ __device__
 operator-(const actor<Eval> &_1)
 {
-  return compose(unary_operator<thrust::negate>(), _1);
+  return compose(transparent_unary_operator<thrust::negate<>>(), _1);
 } // end operator-()
 
 // there's no standard unary_plus functional, so roll an ad hoc one here
-template<typename T>
-  struct unary_plus
-    : public thrust::unary_function<T,T>
+struct unary_plus
 {
-  __host__ __device__ T operator()(const T &x) const {return +x;}
-}; // end unary_plus
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1) const
+  noexcept(noexcept(+THRUST_FWD(t1)))
+  THRUST_TRAILING_RETURN(decltype(+THRUST_FWD(t1)))
+  {
+    return +THRUST_FWD(t1);
+  }
+};
 
 template<typename Eval>
 __host__ __device__
 actor<
   composite<
-    unary_operator<unary_plus>,
+    transparent_unary_operator<unary_plus>,
     actor<Eval>
   >
 >
 operator+(const actor<Eval> &_1)
 {
-  return compose(unary_operator<unary_plus>(), _1);
+  return compose(transparent_unary_operator<unary_plus>(), _1);
 } // end operator+()
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::plus>,
+    transparent_binary_operator<thrust::plus<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator+(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::plus>(),
+  return compose(transparent_binary_operator<thrust::plus<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator+()
@@ -84,14 +91,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::plus>,
+    transparent_binary_operator<thrust::plus<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator+(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::plus>(),
+  return compose(transparent_binary_operator<thrust::plus<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator+()
@@ -100,14 +107,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::plus>,
+    transparent_binary_operator<thrust::plus<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator+(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::plus>(),
+  return compose(transparent_binary_operator<thrust::plus<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator+()
@@ -116,14 +123,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::minus>,
+    transparent_binary_operator<thrust::minus<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator-(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::minus>(),
+  return compose(transparent_binary_operator<thrust::minus<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator-()
@@ -132,14 +139,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::minus>,
+    transparent_binary_operator<thrust::minus<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator-(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::minus>(),
+  return compose(transparent_binary_operator<thrust::minus<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator-()
@@ -148,14 +155,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::minus>,
+    transparent_binary_operator<thrust::minus<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator-(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::minus>(),
+  return compose(transparent_binary_operator<thrust::minus<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator-()
@@ -164,14 +171,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::multiplies>,
+    transparent_binary_operator<thrust::multiplies<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator*(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::multiplies>(),
+  return compose(transparent_binary_operator<thrust::multiplies<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator*()
@@ -180,14 +187,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::multiplies>,
+    transparent_binary_operator<thrust::multiplies<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator*(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::multiplies>(),
+  return compose(transparent_binary_operator<thrust::multiplies<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator*()
@@ -196,14 +203,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::multiplies>,
+    transparent_binary_operator<thrust::multiplies<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator*(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::multiplies>(),
+  return compose(transparent_binary_operator<thrust::multiplies<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator*()
@@ -212,14 +219,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::divides>,
+    transparent_binary_operator<thrust::divides<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator/(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::divides>(),
+  return compose(transparent_binary_operator<thrust::divides<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator/()
@@ -228,14 +235,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::divides>,
+    transparent_binary_operator<thrust::divides<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator/(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::divides>(),
+  return compose(transparent_binary_operator<thrust::divides<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator/()
@@ -244,14 +251,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::divides>,
+    transparent_binary_operator<thrust::divides<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator/(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::divides>(),
+  return compose(transparent_binary_operator<thrust::divides<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator/()
@@ -260,14 +267,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::modulus>,
+    transparent_binary_operator<thrust::modulus<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator%(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::modulus>(),
+  return compose(transparent_binary_operator<thrust::modulus<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator%()
@@ -276,14 +283,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::modulus>,
+    transparent_binary_operator<thrust::modulus<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator%(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::modulus>(),
+  return compose(transparent_binary_operator<thrust::modulus<void>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator%()
@@ -292,103 +299,138 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::modulus>,
+    transparent_binary_operator<thrust::modulus<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator%(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::modulus>(),
+  return compose(transparent_binary_operator<thrust::modulus<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator%()
 
 // there's no standard prefix_increment functional, so roll an ad hoc one here
-template<typename T>
-  struct prefix_increment
-    : public thrust::unary_function<T&,T&>
+struct prefix_increment
 {
-  __host__ __device__ T& operator()(T &x) const { return ++x; }
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1) const
+  noexcept(noexcept(++THRUST_FWD(t1)))
+  THRUST_TRAILING_RETURN(decltype(++THRUST_FWD(t1)))
+  {
+    return ++THRUST_FWD(t1);
+  }
 }; // end prefix_increment
 
 template<typename Eval>
 __host__ __device__
 actor<
   composite<
-    unary_operator<prefix_increment>,
+    transparent_unary_operator<prefix_increment>,
     actor<Eval>
   >
 >
 operator++(const actor<Eval> &_1)
 {
-  return compose(unary_operator<prefix_increment>(), _1);
+  return compose(transparent_unary_operator<prefix_increment>(), _1);
 } // end operator++()
 
-// there's no standard suffix_increment functional, so roll an ad hoc one here
-template<typename T>
-  struct suffix_increment
-    : public thrust::unary_function<T&,T>
+
+// there's no standard postfix_increment functional, so roll an ad hoc one here
+struct postfix_increment
 {
-  __host__ __device__ T operator()(T &x) const { return x++; }
-}; // end suffix_increment
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1) const
+  noexcept(noexcept(THRUST_FWD(t1)++))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1)++))
+  {
+    return THRUST_FWD(t1)++;
+  }
+}; // end postfix_increment
 
 template<typename Eval>
 __host__ __device__
 actor<
   composite<
-    unary_operator<suffix_increment>,
+    transparent_unary_operator<postfix_increment>,
     actor<Eval>
   >
 >
 operator++(const actor<Eval> &_1, int)
 {
-  return compose(unary_operator<suffix_increment>(), _1);
+  return compose(transparent_unary_operator<postfix_increment>(), _1);
 } // end operator++()
 
+
 // there's no standard prefix_decrement functional, so roll an ad hoc one here
-template<typename T>
-  struct prefix_decrement
-    : public thrust::unary_function<T&,T&>
+struct prefix_decrement
 {
-  __host__ __device__ T& operator()(T &x) const { return --x; }
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1) const
+  noexcept(noexcept(--THRUST_FWD(t1)))
+  THRUST_TRAILING_RETURN(decltype(--THRUST_FWD(t1)))
+  {
+    return --THRUST_FWD(t1);
+  }
 }; // end prefix_decrement
 
 template<typename Eval>
 __host__ __device__
 actor<
   composite<
-    unary_operator<prefix_decrement>,
+    transparent_unary_operator<prefix_decrement>,
     actor<Eval>
   >
 >
 operator--(const actor<Eval> &_1)
 {
-  return compose(unary_operator<prefix_decrement>(), _1);
+  return compose(transparent_unary_operator<prefix_decrement>(), _1);
 } // end operator--()
 
-// there's no standard suffix_decrement functional, so roll an ad hoc one here
-template<typename T>
-  struct suffix_decrement
-    : public thrust::unary_function<T&,T>
+
+// there's no standard postfix_decrement functional, so roll an ad hoc one here
+struct postfix_decrement
 {
-  __host__ __device__ T operator()(T &x) const { return x--; }
-}; // end suffix_decrement
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1) const
+  noexcept(noexcept(THRUST_FWD(t1)--))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1)--))
+  {
+    return THRUST_FWD(t1)--;
+  }
+}; // end prefix_increment
 
 template<typename Eval>
 __host__ __device__
 actor<
   composite<
-    unary_operator<suffix_decrement>,
+    transparent_unary_operator<postfix_decrement>,
     actor<Eval>
   >
 >
 operator--(const actor<Eval> &_1, int)
 {
-  return compose(unary_operator<suffix_decrement>(), _1);
+  return compose(transparent_unary_operator<postfix_decrement>(), _1);
 } // end operator--()
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/functional/operators/assignment_operator.h b/thrust/detail/functional/operators/assignment_operator.h
index fb8958f88..870354b6f 100644
--- a/thrust/detail/functional/operators/assignment_operator.h
+++ b/thrust/detail/functional/operators/assignment_operator.h
@@ -22,8 +22,7 @@
 #include <thrust/detail/functional/operators/operator_adaptors.h>
 #include <thrust/functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 // XXX WAR circular inclusion with this forward declaration
 template<typename,typename,typename> struct binary_function;
@@ -37,19 +36,27 @@ namespace functional
 template<typename> struct as_actor;
 
 // there's no standard assign functional, so roll an ad hoc one here
-template<typename T>
-  struct assign
-    : thrust::binary_function<T&,T,T&>
+struct assign
 {
-  __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs = rhs; }
-}; // end assign
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) = THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) = THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) = THRUST_FWD(t2);
+  }
+};
 
 template<typename Eval, typename T>
   struct assign_result
 {
   typedef actor<
     composite<
-      binary_operator<assign>,
+      transparent_binary_operator<assign>,
       actor<Eval>,
       typename as_actor<T>::type
     >
@@ -61,12 +68,12 @@ template<typename Eval, typename T>
     typename assign_result<Eval,T>::type
       do_assign(const actor<Eval> &_1, const T &_2)
 {
-  return compose(binary_operator<assign>(),
+  return compose(transparent_binary_operator<assign>(),
                  _1,
                  as_actor<T>::convert(_2));
 } // end do_assign()
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/functional/operators/bitwise_operators.h b/thrust/detail/functional/operators/bitwise_operators.h
index 796f1701c..065cd1540 100644
--- a/thrust/detail/functional/operators/bitwise_operators.h
+++ b/thrust/detail/functional/operators/bitwise_operators.h
@@ -22,8 +22,7 @@
 #include <thrust/detail/functional/operators/operator_adaptors.h>
 #include <thrust/functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
@@ -33,14 +32,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::bit_and>,
+    transparent_binary_operator<bit_and<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator&(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::bit_and>(),
+  return compose(transparent_binary_operator<bit_and<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&()
@@ -49,14 +48,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::bit_and>,
+    transparent_binary_operator<bit_and<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator&(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::bit_and>(),
+  return compose(transparent_binary_operator<bit_and<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&()
@@ -65,14 +64,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::bit_and>,
+    transparent_binary_operator<bit_and<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator&(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::bit_and>(),
+  return compose(transparent_binary_operator<bit_and<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&()
@@ -81,14 +80,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::bit_or>,
+    transparent_binary_operator<bit_or<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator|(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::bit_or>(),
+  return compose(transparent_binary_operator<bit_or<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator|()
@@ -97,14 +96,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::bit_or>,
+    transparent_binary_operator<bit_or<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator|(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::bit_or>(),
+  return compose(transparent_binary_operator<bit_or<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator|()
@@ -113,14 +112,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::bit_or>,
+    transparent_binary_operator<bit_or<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator|(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::bit_or>(),
+  return compose(transparent_binary_operator<bit_or<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator|()
@@ -129,14 +128,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::bit_xor>,
+    transparent_binary_operator<bit_xor<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator^(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::bit_xor>(),
+  return compose(transparent_binary_operator<bit_xor<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator^()
@@ -145,14 +144,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::bit_xor>,
+    transparent_binary_operator<bit_xor<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator^(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::bit_xor>(),
+  return compose(transparent_binary_operator<bit_xor<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator^()
@@ -161,60 +160,77 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::bit_xor>,
+    transparent_binary_operator<bit_xor<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator^(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::bit_xor>(),
+  return compose(transparent_binary_operator<bit_xor<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator^()
 
+
 // there's no standard bit_not functional, so roll an ad hoc one here
-template<typename T>
-  struct bit_not
-    : public thrust::unary_function<T,T>
+struct bit_not
 {
-  __host__ __device__ T operator()(const T &x) const {return ~x;}
-}; // end bit_not
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1) const
+  noexcept(noexcept(~THRUST_FWD(t1)))
+  THRUST_TRAILING_RETURN(decltype(~THRUST_FWD(t1)))
+  {
+    return ~THRUST_FWD(t1);
+  }
+}; // end prefix_increment
 
 template<typename Eval>
 __host__ __device__
 actor<
   composite<
-    unary_operator<bit_not>,
+    transparent_unary_operator<bit_not>,
     actor<Eval>
   >
 >
 __host__ __device__
 operator~(const actor<Eval> &_1)
 {
-  return compose(unary_operator<bit_not>(), _1);
+  return compose(transparent_unary_operator<bit_not>(), _1);
 } // end operator~()
 
 // there's no standard bit_lshift functional, so roll an ad hoc one here
-template<typename T>
-  struct bit_lshift
-    : public thrust::binary_function<T,T,T>
+struct bit_lshift
 {
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs << rhs;}
-}; // end bit_lshift
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) << THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) << THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) << THRUST_FWD(t2);
+  }
+};
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_lshift>,
+    transparent_binary_operator<bit_lshift>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator<<(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<bit_lshift>(),
+  return compose(transparent_binary_operator<bit_lshift>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<<()
@@ -223,14 +239,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_lshift>,
+    transparent_binary_operator<bit_lshift>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator<<(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<bit_lshift>(),
+  return compose(transparent_binary_operator<bit_lshift>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<<()
@@ -239,38 +255,47 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_lshift>,
+    transparent_binary_operator<bit_lshift>,
     actor<T1>,
     actor<T2>
   >
 >
 operator<<(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<bit_lshift>(),
+  return compose(transparent_binary_operator<bit_lshift>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<<()
 
 // there's no standard bit_rshift functional, so roll an ad hoc one here
-template<typename T>
-  struct bit_rshift
-    : public thrust::binary_function<T,T,T>
+struct bit_rshift
 {
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs >> rhs;}
-}; // end bit_rshift
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) >> THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) >> THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) >> THRUST_FWD(t2);
+  }
+};
+
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_rshift>,
+    transparent_binary_operator<bit_rshift>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator>>(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<bit_rshift>(),
+  return compose(transparent_binary_operator<bit_rshift>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>>()
@@ -279,14 +304,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_rshift>,
+    transparent_binary_operator<bit_rshift>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator>>(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<bit_rshift>(),
+  return compose(transparent_binary_operator<bit_rshift>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>>()
@@ -295,19 +320,19 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_rshift>,
+    transparent_binary_operator<bit_rshift>,
     actor<T1>,
     actor<T2>
   >
 >
 operator>>(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<bit_rshift>(),
+  return compose(transparent_binary_operator<bit_rshift>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>>()
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/functional/operators/compound_assignment_operators.h b/thrust/detail/functional/operators/compound_assignment_operators.h
index cb8d4c105..b5ba77fb4 100644
--- a/thrust/detail/functional/operators/compound_assignment_operators.h
+++ b/thrust/detail/functional/operators/compound_assignment_operators.h
@@ -21,32 +21,40 @@
 #include <thrust/detail/functional/composite.h>
 #include <thrust/detail/functional/operators/operator_adaptors.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
 {
 
-template<typename T>
-  struct plus_equal
-    : public thrust::binary_function<T&,T,T&>
+// there's no standard plus_equal functional, so roll an ad hoc one here
+struct plus_equal
 {
-  __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs += rhs; }
-}; // end plus_equal
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) += THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) += THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) += THRUST_FWD(t2);
+  }
+};
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<plus_equal>,
+    transparent_binary_operator<plus_equal>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator+=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<plus_equal>(),
+  return compose(transparent_binary_operator<plus_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator+=()
@@ -55,37 +63,46 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<plus_equal>,
+    transparent_binary_operator<plus_equal>,
     actor<T1>,
     actor<T2>
   >
 >
 operator+=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<plus_equal>(),
+  return compose(transparent_binary_operator<plus_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator+=()
 
-template<typename T>
-  struct minus_equal
-    : public thrust::binary_function<T&,T,T&>
+// there's no standard minus_equal functional, so roll an ad hoc one here
+struct minus_equal
 {
-  __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs -= rhs; }
-}; // end minus_equal
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) -= THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) -= THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) -= THRUST_FWD(t2);
+  }
+};
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<minus_equal>,
+    transparent_binary_operator<minus_equal>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator-=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<minus_equal>(),
+  return compose(transparent_binary_operator<minus_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator-=()
@@ -94,37 +111,46 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<minus_equal>,
+    transparent_binary_operator<minus_equal>,
     actor<T1>,
     actor<T2>
   >
 >
 operator-=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<minus_equal>(),
+  return compose(transparent_binary_operator<minus_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator-=()
 
-template<typename T>
-  struct multiplies_equal
-    : public thrust::binary_function<T&,T,T&>
+// there's no standard multiplies_equal functional, so roll an ad hoc one here
+struct multiplies_equal
 {
-  __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs *= rhs; }
-}; // end multiplies_equal
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) *= THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) *= THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) *= THRUST_FWD(t2);
+  }
+};
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<multiplies_equal>,
+    transparent_binary_operator<multiplies_equal>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator*=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<multiplies_equal>(),
+  return compose(transparent_binary_operator<multiplies_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator*=()
@@ -133,37 +159,46 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<multiplies_equal>,
+    transparent_binary_operator<multiplies_equal>,
     actor<T1>,
     actor<T2>
   >
 >
 operator*=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<multiplies_equal>(),
+  return compose(transparent_binary_operator<multiplies_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator*=()
 
-template<typename T>
-  struct divides_equal
-    : public thrust::binary_function<T&,T,T&>
+// there's no standard divides_equal functional, so roll an ad hoc one here
+struct divides_equal
 {
-  __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs /= rhs; }
-}; // end divides_equal
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) /= THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) /= THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) /= THRUST_FWD(t2);
+  }
+};
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<divides_equal>,
+    transparent_binary_operator<divides_equal>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator/=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<divides_equal>(),
+  return compose(transparent_binary_operator<divides_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator/=()
@@ -172,37 +207,46 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<divides_equal>,
+    transparent_binary_operator<divides_equal>,
     actor<T1>,
     actor<T2>
   >
 >
 operator/=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<divides_equal>(),
+  return compose(transparent_binary_operator<divides_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator/=()
 
-template<typename T>
-  struct modulus_equal
-    : public thrust::binary_function<T&,T,T&>
+// there's no standard modulus_equal functional, so roll an ad hoc one here
+struct modulus_equal
 {
-  __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs %= rhs; }
-}; // end modulus_equal
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) %= THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) %= THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) %= THRUST_FWD(t2);
+  }
+};
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<modulus_equal>,
+    transparent_binary_operator<modulus_equal>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator%=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<modulus_equal>(),
+  return compose(transparent_binary_operator<modulus_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator%=()
@@ -211,37 +255,46 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<modulus_equal>,
+    transparent_binary_operator<modulus_equal>,
     actor<T1>,
     actor<T2>
   >
 >
 operator%=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<modulus_equal>(),
+  return compose(transparent_binary_operator<modulus_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator%=()
 
-template<typename T>
-  struct bit_and_equal
-    : public thrust::binary_function<T&,T,T&>
+// there's no standard bit_and_equal functional, so roll an ad hoc one here
+struct bit_and_equal
 {
-  __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs &= rhs; }
-}; // end bit_and_equal
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) &= THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) &= THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) &= THRUST_FWD(t2);
+  }
+};
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_and_equal>,
+    transparent_binary_operator<bit_and_equal>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator&=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<bit_and_equal>(),
+  return compose(transparent_binary_operator<bit_and_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&=()
@@ -250,37 +303,46 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_and_equal>,
+    transparent_binary_operator<bit_and_equal>,
     actor<T1>,
     actor<T2>
   >
 >
 operator&=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<bit_and_equal>(),
+  return compose(transparent_binary_operator<bit_and_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&=()
 
-template<typename T>
-  struct bit_or_equal
-    : public thrust::binary_function<T&,T,T&>
+// there's no standard bit_or_equal functional, so roll an ad hoc one here
+struct bit_or_equal
 {
-  __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs |= rhs; }
-}; // end bit_or_equal
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) |= THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) |= THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) |= THRUST_FWD(t2);
+  }
+};
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_or_equal>,
+    transparent_binary_operator<bit_or_equal>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator|=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<bit_or_equal>(),
+  return compose(transparent_binary_operator<bit_or_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator|=()
@@ -289,37 +351,46 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_or_equal>,
+    transparent_binary_operator<bit_or_equal>,
     actor<T1>,
     actor<T2>
   >
 >
 operator|=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<bit_or_equal>(),
+  return compose(transparent_binary_operator<bit_or_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator|=()
 
-template<typename T>
-  struct bit_xor_equal
-    : public thrust::binary_function<T&,T,T&>
+// there's no standard bit_xor_equal functional, so roll an ad hoc one here
+struct bit_xor_equal
 {
-  __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs ^= rhs; }
-}; // end bit_xor_equal
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) ^= THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) ^= THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) ^= THRUST_FWD(t2);
+  }
+};
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_xor_equal>,
+    transparent_binary_operator<bit_xor_equal>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator^=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<bit_xor_equal>(),
+  return compose(transparent_binary_operator<bit_xor_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator|=()
@@ -328,37 +399,45 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_xor_equal>,
+    transparent_binary_operator<bit_xor_equal>,
     actor<T1>,
     actor<T2>
   >
 >
 operator^=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<bit_xor_equal>(),
+  return compose(transparent_binary_operator<bit_xor_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator|=()
 
-template<typename T>
-  struct bit_lshift_equal
-    : public thrust::binary_function<T&,T,T&>
-{
-  __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs <<= rhs; }
-}; // end bit_lshift_equal
-
+// there's no standard bit_lshift_equal functional, so roll an ad hoc one here
+struct bit_lshift_equal
+{
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) <<= THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) <<= THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) <<= THRUST_FWD(t2);
+  }
+};
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_lshift_equal>,
+    transparent_binary_operator<bit_lshift_equal>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator<<=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<bit_lshift_equal>(),
+  return compose(transparent_binary_operator<bit_lshift_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<<=()
@@ -367,37 +446,46 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_lshift_equal>,
+    transparent_binary_operator<bit_lshift_equal>,
     actor<T1>,
     actor<T2>
   >
 >
 operator<<=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<bit_lshift_equal>(),
+  return compose(transparent_binary_operator<bit_lshift_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<<=()
 
-template<typename T>
-  struct bit_rshift_equal
-    : public thrust::binary_function<T&,T,T&>
+// there's no standard bit_rshift_equal functional, so roll an ad hoc one here
+struct bit_rshift_equal
 {
-  __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs >>= rhs; }
-}; // end bit_rshift_equal
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) >>= THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) >>= THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) >>= THRUST_FWD(t2);
+  }
+};
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_rshift_equal>,
+    transparent_binary_operator<bit_rshift_equal>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator>>=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<bit_rshift_equal>(),
+  return compose(transparent_binary_operator<bit_rshift_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>>=()
@@ -406,19 +494,19 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_rshift_equal>,
+    transparent_binary_operator<bit_rshift_equal>,
     actor<T1>,
     actor<T2>
   >
 >
 operator>>=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<bit_rshift_equal>(),
+  return compose(transparent_binary_operator<bit_rshift_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>>=()
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/functional/operators/logical_operators.h b/thrust/detail/functional/operators/logical_operators.h
index f5e39e125..e1e4ff719 100644
--- a/thrust/detail/functional/operators/logical_operators.h
+++ b/thrust/detail/functional/operators/logical_operators.h
@@ -22,8 +22,7 @@
 #include <thrust/detail/functional/operators/operator_adaptors.h>
 #include <thrust/functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
@@ -33,14 +32,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::logical_and>,
+    transparent_binary_operator<thrust::logical_and<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator&&(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::logical_and>(),
+  return compose(transparent_binary_operator<thrust::logical_and<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&&()
@@ -49,14 +48,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::logical_and>,
+    transparent_binary_operator<thrust::logical_and<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator&&(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::logical_and>(),
+  return compose(transparent_binary_operator<thrust::logical_and<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&&()
@@ -65,14 +64,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::logical_and>,
+    transparent_binary_operator<thrust::logical_and<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator&&(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::logical_and>(),
+  return compose(transparent_binary_operator<thrust::logical_and<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&&()
@@ -81,14 +80,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::logical_or>,
+    transparent_binary_operator<thrust::logical_or<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator||(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::logical_or>(),
+  return compose(transparent_binary_operator<thrust::logical_or<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&&()
@@ -97,14 +96,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::logical_or>,
+    transparent_binary_operator<thrust::logical_or<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator||(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::logical_or>(),
+  return compose(transparent_binary_operator<thrust::logical_or<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&&()
@@ -113,14 +112,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::logical_or>,
+    transparent_binary_operator<thrust::logical_or<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator||(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::logical_or>(),
+  return compose(transparent_binary_operator<thrust::logical_or<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&&()
@@ -129,16 +128,16 @@ template<typename Eval>
 __host__ __device__
 actor<
   composite<
-    unary_operator<thrust::logical_not>,
+    transparent_unary_operator<thrust::logical_not<>>,
     actor<Eval>
   >
 >
 operator!(const actor<Eval> &_1)
 {
-  return compose(unary_operator<thrust::logical_not>(), _1);
+  return compose(transparent_unary_operator<thrust::logical_not<>>(), _1);
 } // end operator!()
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/functional/operators/operator_adaptors.h b/thrust/detail/functional/operators/operator_adaptors.h
index 664921113..67326c5c1 100644
--- a/thrust/detail/functional/operators/operator_adaptors.h
+++ b/thrust/detail/functional/operators/operator_adaptors.h
@@ -17,99 +17,120 @@
 #pragma once
 
 #include <thrust/detail/config.h>
+#include <thrust/detail/functional/argument.h>
+#include <thrust/detail/type_deduction.h>
 #include <thrust/tuple.h>
 #include <thrust/detail/type_traits.h>
+#include <thrust/type_traits/void_t.h>
 
-namespace thrust
-{
+#include <type_traits>
+
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
 {
 
-// this thing (which models Eval) is an adaptor for the unary
-// functors inside functional.h
-template<template<typename> class UnaryOperator>
-  struct unary_operator
+// Adapts a transparent unary functor from functional.h (e.g. thrust::negate<>)
+// into the Eval interface.
+template <typename UnaryFunctor>
+struct transparent_unary_operator
 {
-  template<typename Env>
-    struct argument
-      : thrust::detail::eval_if<
-          (thrust::tuple_size<Env>::value == 0),
-          thrust::detail::identity_<thrust::null_type>,
-          thrust::tuple_element<0,Env>
-        >
-  {
-  };
+  template <typename>
+  using operator_type = UnaryFunctor;
 
-  template<typename Env>
-    struct operator_type
+  template <typename Env>
+  using argument =
+  typename thrust::detail::eval_if<
+    thrust::tuple_size<Env>::value != 1,
+    thrust::detail::identity_<thrust::null_type>,
+    thrust::detail::functional::argument_helper<0, Env>
+  >::type;
+
+  template <typename Env>
+  struct result_type_impl
   {
-    typedef UnaryOperator<
-      typename thrust::detail::remove_reference<
-        typename argument<Env>::type
-      >::type
-    > type;
+    using type = decltype(
+      std::declval<UnaryFunctor>()(std::declval<argument<Env>>()));
   };
 
-  template<typename Env>
-    struct result
+  template <typename Env>
+  using result_type =
+  typename thrust::detail::eval_if<
+    std::is_same<thrust::null_type, argument<Env>>::value,
+    thrust::detail::identity_<thrust::null_type>,
+    result_type_impl<Env>
+  >::type;
+
+  template <typename Env>
+  struct result
   {
-    typedef typename operator_type<Env>::type op_type;
-    typedef typename op_type::result_type type;
+    using op_type = UnaryFunctor;
+    using type = result_type<Env>;
   };
 
-  template<typename Env>
+  template <typename Env>
   __host__ __device__
-  typename result<Env>::type eval(const Env &e) const
-  {
-    typename operator_type<Env>::type op;
-    return op(thrust::get<0>(e));
-  } // end eval()
-}; // end unary_operator
-
-// this thing (which models Eval) is an adaptor for the binary
-// functors inside functional.h
-template<template<typename> class BinaryOperator>
-  struct binary_operator
+  result_type<Env> eval(Env&& e) const
+  THRUST_RETURNS(UnaryFunctor{}(thrust::get<0>(THRUST_FWD(e))))
+};
+
+
+// Adapts a transparent binary functor from functional.h (e.g. thrust::less<>)
+// into the Eval interface.
+template <typename BinaryFunctor>
+struct transparent_binary_operator
 {
-  template<typename Env>
-    struct first_argument
-      : thrust::detail::eval_if<
-          (thrust::tuple_size<Env>::value == 0),
-          thrust::detail::identity_<thrust::null_type>,
-          thrust::tuple_element<0,Env>
-        >
-  {
-  };
+  template <typename>
+  using operator_type = BinaryFunctor;
+
+  template <typename Env>
+  using first_argument =
+    typename thrust::detail::eval_if<
+      thrust::tuple_size<Env>::value != 2,
+      thrust::detail::identity_<thrust::null_type>,
+      thrust::detail::functional::argument_helper<0, Env>
+    >::type;
 
-  template<typename Env>
-    struct operator_type
+  template <typename Env>
+  using second_argument =
+    typename thrust::detail::eval_if<
+      thrust::tuple_size<Env>::value != 2,
+      thrust::detail::identity_<thrust::null_type>,
+      thrust::detail::functional::argument_helper<1, Env>
+    >::type;
+
+  template <typename Env>
+  struct result_type_impl
   {
-    typedef BinaryOperator<
-      typename thrust::detail::remove_reference<
-        typename first_argument<Env>::type
-      >::type
-    > type;
+    using type = decltype(
+      std::declval<BinaryFunctor>()(std::declval<first_argument<Env>>(),
+                                    std::declval<second_argument<Env>>()));
   };
 
-  template<typename Env>
-    struct result
+  template <typename Env>
+  using result_type =
+    typename thrust::detail::eval_if<
+      (std::is_same<thrust::null_type, first_argument<Env>>::value ||
+       std::is_same<thrust::null_type, second_argument<Env>>::value),
+      thrust::detail::identity_<thrust::null_type>,
+      result_type_impl<Env>
+    >::type;
+
+  template <typename Env>
+  struct result
   {
-    typedef typename operator_type<Env>::type op_type;
-    typedef typename op_type::result_type type;
+    using op_type = BinaryFunctor;
+    using type = result_type<Env>;
   };
 
-  template<typename Env>
+  template <typename Env>
   __host__ __device__
-  typename result<Env>::type eval(const Env &e) const
-  {
-    typename operator_type<Env>::type op;
-    return op(thrust::get<0>(e), thrust::get<1>(e));
-  } // end eval()
-}; // end binary_operator
+  result_type<Env> eval(Env&& e) const
+  THRUST_RETURNS(BinaryFunctor{}(thrust::get<0>(e), thrust::get<1>(e)))
+};
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/functional/operators/relational_operators.h b/thrust/detail/functional/operators/relational_operators.h
index ec8864715..6c58325e2 100644
--- a/thrust/detail/functional/operators/relational_operators.h
+++ b/thrust/detail/functional/operators/relational_operators.h
@@ -22,8 +22,7 @@
 #include <thrust/detail/functional/operators/operator_adaptors.h>
 #include <thrust/functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
@@ -33,14 +32,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::equal_to>,
+    transparent_binary_operator<thrust::equal_to<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator==(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::equal_to>(),
+  return compose(transparent_binary_operator<thrust::equal_to<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator==()
@@ -49,14 +48,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::equal_to>,
+    transparent_binary_operator<thrust::equal_to<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator==(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::equal_to>(),
+  return compose(transparent_binary_operator<thrust::equal_to<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator==()
@@ -65,14 +64,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::equal_to>,
+    transparent_binary_operator<thrust::equal_to<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator==(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::equal_to>(),
+  return compose(transparent_binary_operator<thrust::equal_to<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator==()
@@ -81,14 +80,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::not_equal_to>,
+    transparent_binary_operator<thrust::not_equal_to<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator!=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::not_equal_to>(),
+  return compose(transparent_binary_operator<thrust::not_equal_to<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator!=()
@@ -97,14 +96,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::not_equal_to>,
+    transparent_binary_operator<thrust::not_equal_to<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator!=(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::not_equal_to>(),
+  return compose(transparent_binary_operator<thrust::not_equal_to<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator!=()
@@ -113,14 +112,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::not_equal_to>,
+    transparent_binary_operator<thrust::not_equal_to<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator!=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::not_equal_to>(),
+  return compose(transparent_binary_operator<thrust::not_equal_to<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator!=()
@@ -129,14 +128,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::greater>,
+    transparent_binary_operator<thrust::greater<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator>(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::greater>(),
+  return compose(transparent_binary_operator<thrust::greater<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>()
@@ -145,14 +144,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::greater>,
+    transparent_binary_operator<thrust::greater<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator>(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::greater>(),
+  return compose(transparent_binary_operator<thrust::greater<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>()
@@ -161,14 +160,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::greater>,
+    transparent_binary_operator<thrust::greater<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator>(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::greater>(),
+  return compose(transparent_binary_operator<thrust::greater<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>()
@@ -177,14 +176,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::less>,
+    transparent_binary_operator<thrust::less<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator<(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::less>(),
+  return compose(transparent_binary_operator<thrust::less<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<()
@@ -193,14 +192,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::less>,
+    transparent_binary_operator<thrust::less<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator<(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::less>(),
+  return compose(transparent_binary_operator<thrust::less<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<()
@@ -209,14 +208,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::less>,
+    transparent_binary_operator<thrust::less<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator<(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::less>(),
+  return compose(transparent_binary_operator<thrust::less<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<()
@@ -225,14 +224,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::greater_equal>,
+    transparent_binary_operator<thrust::greater_equal<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator>=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::greater_equal>(),
+  return compose(transparent_binary_operator<thrust::greater_equal<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>=()
@@ -241,14 +240,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::greater_equal>,
+    transparent_binary_operator<thrust::greater_equal<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator>=(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::greater_equal>(),
+  return compose(transparent_binary_operator<thrust::greater_equal<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>=()
@@ -257,14 +256,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::greater_equal>,
+    transparent_binary_operator<thrust::greater_equal<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator>=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::greater_equal>(),
+  return compose(transparent_binary_operator<thrust::greater_equal<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>=()
@@ -273,14 +272,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::less_equal>,
+    transparent_binary_operator<thrust::less_equal<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator<=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::less_equal>(),
+  return compose(transparent_binary_operator<thrust::less_equal<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<=()
@@ -289,14 +288,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::less_equal>,
+    transparent_binary_operator<thrust::less_equal<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator<=(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::less_equal>(),
+  return compose(transparent_binary_operator<thrust::less_equal<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<=()
@@ -305,19 +304,19 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::less_equal>,
+    transparent_binary_operator<thrust::less_equal<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator<=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::less_equal>(),
+  return compose(transparent_binary_operator<thrust::less_equal<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<=()
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/functional/placeholder.h b/thrust/detail/functional/placeholder.h
index d0832cfec..e3c083553 100644
--- a/thrust/detail/functional/placeholder.h
+++ b/thrust/detail/functional/placeholder.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/functional/actor.h>
 #include <thrust/detail/functional/argument.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
@@ -35,5 +34,5 @@ template<unsigned int i>
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/functional/value.h b/thrust/detail/functional/value.h
index 27a584676..d6b1563b1 100644
--- a/thrust/detail/functional/value.h
+++ b/thrust/detail/functional/value.h
@@ -28,8 +28,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/functional/actor.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
@@ -76,5 +75,5 @@ actor<value<T> > val(const T &x)
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/gather.inl b/thrust/detail/gather.inl
index 4550742c5..3812702f6 100644
--- a/thrust/detail/gather.inl
+++ b/thrust/detail/gather.inl
@@ -14,10 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file gather.inl
- *  \brief Inline file for gather.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/gather.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,9 +24,7 @@
 #include <thrust/system/detail/generic/gather.h>
 #include <thrust/system/detail/adl/gather.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy,
@@ -96,9 +93,9 @@ template<typename InputIterator,
 {
   using thrust::system::detail::generic::select_system;
 
-  typedef typename thrust::iterator_system<InputIterator>::type        System1; 
-  typedef typename thrust::iterator_system<RandomAccessIterator>::type System2; 
-  typedef typename thrust::iterator_system<OutputIterator>::type       System3; 
+  typedef typename thrust::iterator_system<InputIterator>::type        System1;
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type       System3;
 
   System1 system1;
   System2 system2;
@@ -120,10 +117,10 @@ template<typename InputIterator1,
 {
   using thrust::system::detail::generic::select_system;
 
-  typedef typename thrust::iterator_system<InputIterator1>::type       System1; 
-  typedef typename thrust::iterator_system<InputIterator2>::type       System2; 
-  typedef typename thrust::iterator_system<RandomAccessIterator>::type System3; 
-  typedef typename thrust::iterator_system<OutputIterator>::type       System4; 
+  typedef typename thrust::iterator_system<InputIterator1>::type       System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type       System2;
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System3;
+  typedef typename thrust::iterator_system<OutputIterator>::type       System4;
 
   System1 system1;
   System2 system2;
@@ -148,10 +145,10 @@ template<typename InputIterator1,
 {
   using thrust::system::detail::generic::select_system;
 
-  typedef typename thrust::iterator_system<InputIterator1>::type       System1; 
-  typedef typename thrust::iterator_system<InputIterator2>::type       System2; 
-  typedef typename thrust::iterator_system<RandomAccessIterator>::type System3; 
-  typedef typename thrust::iterator_system<OutputIterator>::type       System4; 
+  typedef typename thrust::iterator_system<InputIterator1>::type       System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type       System2;
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System3;
+  typedef typename thrust::iterator_system<OutputIterator>::type       System4;
 
   System1 system1;
   System2 system2;
@@ -161,6 +158,4 @@ template<typename InputIterator1,
   return thrust::gather_if(select_system(system1,system2,system3,system4), map_first, map_last, stencil, input_first, result, pred);
 } // end gather_if()
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/generate.inl b/thrust/detail/generate.inl
index 2ce2ac936..2ecb65d58 100644
--- a/thrust/detail/generate.inl
+++ b/thrust/detail/generate.inl
@@ -14,11 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file generate.inl
- *  \author Jared Hoberock
- *  \brief Inline file for generate.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/generate.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -26,9 +24,7 @@
 #include <thrust/system/detail/generic/generate.h>
 #include <thrust/system/detail/adl/generate.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy,
@@ -93,6 +89,4 @@ template<typename OutputIterator,
   return thrust::generate_n(select_system(system), first, n, gen);
 } // end generate_n()
 
-
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/get_iterator_value.h b/thrust/detail/get_iterator_value.h
index a7bd1b9d9..27e0a4e47 100644
--- a/thrust/detail/get_iterator_value.h
+++ b/thrust/detail/get_iterator_value.h
@@ -21,7 +21,8 @@
 #include <thrust/detail/type_traits/pointer_traits.h>
 #include <thrust/system/detail/generic/memory.h> // for get_value()
 
-namespace thrust {
+THRUST_NAMESPACE_BEGIN
+
 namespace detail {
 
 // get_iterator_value specialization on iterators
@@ -50,4 +51,5 @@ get_iterator_value(thrust::execution_policy<DerivedPolicy> &exec, Pointer* ptr)
 } // get_iterator_value(exec,Pointer*)
 
 } // namespace detail
-} // namespace thrust
+
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/inner_product.inl b/thrust/detail/inner_product.inl
index 37247e68e..97cd2b0b5 100644
--- a/thrust/detail/inner_product.inl
+++ b/thrust/detail/inner_product.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file inner_product.inl
- *  \brief Inline file for inner_product.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/inner_product.h>
@@ -26,9 +23,7 @@
 #include <thrust/system/detail/generic/inner_product.h>
 #include <thrust/system/detail/adl/inner_product.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy,
@@ -59,7 +54,7 @@ OutputType inner_product(const thrust::detail::execution_policy_base<DerivedPoli
                          InputIterator1 first1,
                          InputIterator1 last1,
                          InputIterator2 first2,
-                         OutputType init, 
+                         OutputType init,
                          BinaryFunction1 binary_op1,
                          BinaryFunction2 binary_op2)
 {
@@ -69,7 +64,7 @@ OutputType inner_product(const thrust::detail::execution_policy_base<DerivedPoli
 
 
 template<typename InputIterator1, typename InputIterator2, typename OutputType>
-OutputType 
+OutputType
 inner_product(InputIterator1 first1, InputIterator1 last1,
               InputIterator2 first2, OutputType init)
 {
@@ -89,7 +84,7 @@ template<typename InputIterator1, typename InputIterator2, typename OutputType,
          typename BinaryFunction1, typename BinaryFunction2>
 OutputType
 inner_product(InputIterator1 first1, InputIterator1 last1,
-              InputIterator2 first2, OutputType init, 
+              InputIterator2 first2, OutputType init,
               BinaryFunction1 binary_op1, BinaryFunction2 binary_op2)
 {
   using thrust::system::detail::generic::select_system;
@@ -103,6 +98,4 @@ inner_product(InputIterator1 first1, InputIterator1 last1,
   return thrust::inner_product(select_system(system1,system2), first1, last1, first2, init, binary_op1, binary_op2);
 } // end inner_product()
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/integer_math.h b/thrust/detail/integer_math.h
index d64577c68..0f8c8aac1 100644
--- a/thrust/detail/integer_math.h
+++ b/thrust/detail/integer_math.h
@@ -17,14 +17,13 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <limits>
+#include <thrust/detail/type_deduction.h>
 
-#if THRUST_CPP_DIALECT >= 2011
-  #include <thrust/detail/type_deduction.h>
-#endif
+#include <nv/target>
 
-namespace thrust
-{
+#include <limits>
+
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -32,22 +31,25 @@ template <typename Integer>
 __host__ __device__ __thrust_forceinline__
 Integer clz(Integer x)
 {
-#if __CUDA_ARCH__
-  return ::__clz(x);
-#else
-  int num_bits = 8 * sizeof(Integer);
-  int num_bits_minus_one = num_bits - 1;
-
-  for (int i = num_bits_minus_one; i >= 0; --i)
-  {
-    if ((Integer(1) << i) & x)
+  Integer result;
+
+  NV_IF_TARGET(NV_IS_DEVICE, (
+    result = ::__clz(x);
+  ), (
+    int num_bits = 8 * sizeof(Integer);
+    int num_bits_minus_one = num_bits - 1;
+    result = num_bits;
+    for (int i = num_bits_minus_one; i >= 0; --i)
     {
-      return num_bits_minus_one - i;
+      if ((Integer(1) << i) & x)
+      {
+        result = num_bits_minus_one - i;
+        break;
+      }
     }
-  }
+  ));
 
-  return num_bits;
-#endif
+  return result;
 }
 
 template <typename Integer>
@@ -146,5 +148,5 @@ Integer0 round_z(Integer0 const x, Integer1 const y)
 #endif
 
 } // end detail
-} // end thrust
 
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/integer_traits.h b/thrust/detail/integer_traits.h
index 97ab4f94d..853af20b8 100644
--- a/thrust/detail/integer_traits.h
+++ b/thrust/detail/integer_traits.h
@@ -20,8 +20,7 @@
 #include <limits>
 #include <limits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -30,16 +29,16 @@ template<typename T>
   class integer_traits
 {
   public:
-    static const bool is_integral = false;
+    static constexpr bool is_integral = false;
 };
 
 template<typename T, T min_val, T max_val>
   class integer_traits_base
 {
   public:
-    static const bool is_integral = true;
-    static const T const_min = min_val;
-    static const T const_max = max_val;
+    static constexpr bool is_integral = true;
+    static constexpr T const_min = min_val;
+    static constexpr T const_max = max_val;
 };
 
 
@@ -128,5 +127,4 @@ template<>
 
 } // end detail
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/internal_functional.h b/thrust/detail/internal_functional.h
index 737e75eb4..a0c4056fe 100644
--- a/thrust/detail/internal_functional.h
+++ b/thrust/detail/internal_functional.h
@@ -23,13 +23,15 @@
 
 #include <thrust/tuple.h>
 #include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/config.h>
+#include <thrust/detail/static_assert.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/iterator/detail/tuple_of_iterator_references.h>
 #include <thrust/detail/raw_reference_cast.h>
-#include <memory> // for ::new
+#include <thrust/detail/memory_wrapper.h> // for ::new
+
+THRUST_NAMESPACE_BEGIN
 
-namespace thrust
-{
 namespace detail
 {
 
@@ -38,12 +40,12 @@ template<typename Predicate>
 struct unary_negate
 {
   typedef bool result_type;
-  
+
   Predicate pred;
-  
+
   __host__ __device__
   explicit unary_negate(const Predicate& pred) : pred(pred) {}
-  
+
   template <typename T>
   __host__ __device__
   bool operator()(const T& x)
@@ -57,12 +59,12 @@ template<typename Predicate>
 struct binary_negate
 {
   typedef bool result_type;
-  
+
   Predicate pred;
-  
+
   __host__ __device__
   explicit binary_negate(const Predicate& pred) : pred(pred) {}
-  
+
   template <typename T1, typename T2>
   __host__ __device__
   bool operator()(const T1& x, const T2& y)
@@ -91,13 +93,13 @@ template<typename Predicate, typename IntegralType>
 struct predicate_to_integral
 {
   Predicate pred;
-  
+
   __host__ __device__
   explicit predicate_to_integral(const Predicate& pred) : pred(pred) {}
-  
+
   template <typename T>
   __host__ __device__
-  bool operator()(const T& x)
+  IntegralType operator()(const T& x)
   {
     return pred(x) ? IntegralType(1) : IntegralType(0);
   }
@@ -109,7 +111,7 @@ template<typename T1>
 struct equal_to
 {
   typedef bool result_type;
-  
+
   template <typename T2>
   __host__ __device__
   bool operator()(const T1& lhs, const T2& rhs) const
@@ -123,10 +125,10 @@ template<typename T2>
 struct equal_to_value
 {
   T2 rhs;
-  
+
   __host__ __device__
   equal_to_value(const T2& rhs) : rhs(rhs) {}
-  
+
   template <typename T1>
   __host__ __device__
   bool operator()(const T1& lhs) const
@@ -139,17 +141,17 @@ template<typename Predicate>
 struct tuple_binary_predicate
 {
   typedef bool result_type;
-  
+
   __host__ __device__
   tuple_binary_predicate(const Predicate& p) : pred(p) {}
-  
+
   template<typename Tuple>
   __host__ __device__
   bool operator()(const Tuple& t) const
-  { 
+  {
     return pred(thrust::get<0>(t), thrust::get<1>(t));
   }
-  
+
   mutable Predicate pred;
 };
 
@@ -157,17 +159,17 @@ template<typename Predicate>
 struct tuple_not_binary_predicate
 {
   typedef bool result_type;
-  
+
   __host__ __device__
   tuple_not_binary_predicate(const Predicate& p) : pred(p) {}
-  
+
   template<typename Tuple>
   __host__ __device__
   bool operator()(const Tuple& t) const
-  { 
+  {
     return !pred(thrust::get<0>(t), thrust::get<1>(t));
   }
-  
+
   mutable Predicate pred;
 };
 
@@ -280,13 +282,10 @@ template<typename T>
 
 template<typename T> struct is_tuple_of_iterator_references : thrust::detail::false_type {};
 
-template<typename T1, typename T2, typename T3,
-         typename T4, typename T5, typename T6,
-         typename T7, typename T8, typename T9,
-         typename T10>
+template<typename... Ts>
   struct is_tuple_of_iterator_references<
     thrust::detail::tuple_of_iterator_references<
-      T1,T2,T3,T4,T5,T6,T7,T8,T9,T10
+      Ts...
     >
   >
     : thrust::detail::true_type
@@ -410,7 +409,7 @@ struct binary_transform_if_functor
 
   __host__ __device__
   binary_transform_if_functor(BinaryFunction binary_op, Predicate pred)
-    : binary_op(binary_op), pred(pred) {} 
+    : binary_op(binary_op), pred(pred) {}
 
   __thrust_exec_check_disable__
   template<typename Tuple>
@@ -466,7 +465,7 @@ struct fill_functor
 
   __thrust_exec_check_disable__
   __host__ __device__
-  fill_functor(const T& _exemplar) 
+  fill_functor(const T& _exemplar)
     : exemplar(_exemplar) {}
 
   __thrust_exec_check_disable__
@@ -555,5 +554,5 @@ template<typename Compare>
 
 
 } // end namespace detail
-} // end namespace thrust
 
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/logical.inl b/thrust/detail/logical.inl
index 2f428bc5f..3d39cac92 100644
--- a/thrust/detail/logical.inl
+++ b/thrust/detail/logical.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file logical.inl
- *  \brief Inline file for logical.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,9 +22,7 @@
 #include <thrust/system/detail/generic/logical.h>
 #include <thrust/system/detail/adl/logical.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy, typename InputIterator, typename Predicate>
@@ -97,6 +92,4 @@ bool none_of(InputIterator first, InputIterator last, Predicate pred)
   return thrust::none_of(select_system(system), first, last, pred);
 }
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/malloc_and_free.h b/thrust/detail/malloc_and_free.h
index 00d9dff18..143518893 100644
--- a/thrust/detail/malloc_and_free.h
+++ b/thrust/detail/malloc_and_free.h
@@ -23,8 +23,7 @@
 #include <thrust/system/detail/generic/memory.h>
 #include <thrust/system/detail/adl/malloc_and_free.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy>
@@ -54,7 +53,7 @@ pointer<T,DerivedPolicy> malloc(const thrust::detail::execution_policy_base<Deri
 
 // XXX WAR nvbug 992955
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-#if CUDA_VERSION < 5000
+#if CUDART_VERSION < 5000
 
 // cudafe generates unqualified calls to free(int *volatile)
 // which get confused with thrust::free
@@ -65,7 +64,7 @@ void free(int *volatile ptr)
   ::free(ptr);
 }
 
-#endif // CUDA_VERSION
+#endif // CUDART_VERSION
 #endif // THRUST_DEVICE_COMPILER
 
 __thrust_exec_check_disable__
@@ -81,5 +80,4 @@ void free(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, Poin
 // XXX consider another form of free which does not take a system argument and
 // instead infers the system from the pointer
 
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/memory_algorithms.h b/thrust/detail/memory_algorithms.h
index 74e863dcc..2f6b3a81d 100644
--- a/thrust/detail/memory_algorithms.h
+++ b/thrust/detail/memory_algorithms.h
@@ -8,16 +8,20 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/allocator/allocator_traits.h>
+#include <thrust/detail/memory_wrapper.h>
 #include <thrust/addressof.h>
 
+#include <nv/target>
+
 #include <utility>
 #include <new>
-#include <memory>
 
-THRUST_BEGIN_NS
+
+THRUST_NAMESPACE_BEGIN
 
 ///////////////////////////////////////////////////////////////////////////////
 
@@ -101,7 +105,6 @@ ForwardIt destroy_n(Allocator const& alloc, ForwardIt first, Size n)
   return first;
 }
 
-#if __cplusplus >= 201103L
 template <typename ForwardIt, typename... Args>
 __host__ __device__
 void uninitialized_construct(
@@ -111,17 +114,24 @@ void uninitialized_construct(
   using T = typename iterator_traits<ForwardIt>::value_type;
 
   ForwardIt current = first;
-  #if !__CUDA_ARCH__ // No exceptions in CUDA.
-  try {
-  #endif
+
+  // No exceptions in CUDA.
+  NV_IF_TARGET(NV_IS_HOST, (
+    try {
+      for (; current != last; ++current)
+      {
+        ::new (static_cast<void*>(addressof(*current))) T(args...);
+      }
+    } catch (...) {
+      destroy(first, current);
+      throw;
+    }
+  ), (
     for (; current != last; ++current)
+    {
       ::new (static_cast<void*>(addressof(*current))) T(args...);
-  #if !__CUDA_ARCH__ // No exceptions in CUDA.
-  } catch (...) {
-    destroy(first, current);
-    throw;
-  }
-  #endif
+    }
+  ));
 }
 
 template <typename Allocator, typename ForwardIt, typename... Args>
@@ -139,17 +149,24 @@ void uninitialized_construct_with_allocator(
   typename traits::allocator_type alloc_T(alloc);
 
   ForwardIt current = first;
-  #if !__CUDA_ARCH__ // No exceptions in CUDA.
-  try {
-  #endif
+
+  // No exceptions in CUDA.
+  NV_IF_TARGET(NV_IS_HOST, (
+    try {
+      for (; current != last; ++current)
+      {
+        traits::construct(alloc_T, addressof(*current), args...);
+      }
+    } catch (...) {
+      destroy(alloc_T, first, current);
+      throw;
+    }
+  ), (
     for (; current != last; ++current)
+    {
       traits::construct(alloc_T, addressof(*current), args...);
-  #if !__CUDA_ARCH__ // No exceptions in CUDA.
-  } catch (...) {
-    destroy(alloc_T, first, current);
-    throw;
-  }
-  #endif
+    }
+  ));
 }
 
 template <typename ForwardIt, typename Size, typename... Args>
@@ -160,17 +177,24 @@ void uninitialized_construct_n(
   using T = typename iterator_traits<ForwardIt>::value_type;
 
   ForwardIt current = first;
-  #if !__CUDA_ARCH__ // No exceptions in CUDA.
-  try {
-  #endif
-    for (; n > 0; (void) ++current, --n)
+
+  // No exceptions in CUDA.
+  NV_IF_TARGET(NV_IS_HOST, (
+    try {
+      for (; n > 0; ++current, --n)
+      {
+        ::new (static_cast<void*>(addressof(*current))) T(args...);
+      }
+    } catch (...) {
+      destroy(first, current);
+      throw;
+    }
+  ), (
+    for (; n > 0; ++current, --n)
+    {
       ::new (static_cast<void*>(addressof(*current))) T(args...);
-  #if !__CUDA_ARCH__ // No exceptions in CUDA.
-  } catch (...) {
-    destroy(first, current);
-    throw;
-  }
-  #endif
+    }
+  ));
 }
 
 template <typename Allocator, typename ForwardIt, typename Size, typename... Args>
@@ -188,21 +212,26 @@ void uninitialized_construct_n_with_allocator(
   typename traits::allocator_type alloc_T(alloc);
 
   ForwardIt current = first;
-  #if !__CUDA_ARCH__ // No exceptions in CUDA.
-  try {
-  #endif
+
+  // No exceptions in CUDA.
+  NV_IF_TARGET(NV_IS_HOST, (
+    try {
+      for (; n > 0; (void) ++current, --n)
+      {
+        traits::construct(alloc_T, addressof(*current), args...);
+      }
+    } catch (...) {
+      destroy(alloc_T, first, current);
+      throw;
+    }
+  ), (
     for (; n > 0; (void) ++current, --n)
+    {
       traits::construct(alloc_T, addressof(*current), args...);
-  #if !__CUDA_ARCH__ // No exceptions in CUDA.
-  } catch (...) {
-    destroy(alloc_T, first, current);
-    throw;
-  }
-  #endif
+    }
+  ));
 }
-#endif
 
 ///////////////////////////////////////////////////////////////////////////////
 
-THRUST_END_NS
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/memory_wrapper.h b/thrust/detail/memory_wrapper.h
new file mode 100644
index 000000000..bfc9056fa
--- /dev/null
+++ b/thrust/detail/memory_wrapper.h
@@ -0,0 +1,30 @@
+/*
+ *  Copyright 2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+// When a compiler uses Thrust as part of its implementation of Standard C++
+// algorithms, a cycle of included files may result when Thrust code tries to
+// use a standard algorithm.  Having a macro that is defined only when Thrust
+// is including an algorithms-related header gives the compiler a chance to
+// detect and break the cycle of includes.  (<memory> declares several standard
+// algorithms, including all of the uninitialized_* algorithms.  "_ALGORITHMS_"
+// in the macro name is meant generically, not as a specific reference to
+// the header <algorithms>.)
+
+#define THRUST_INCLUDING_ALGORITHMS_HEADER
+#include <memory>
+#undef  THRUST_INCLUDING_ALGORITHMS_HEADER
diff --git a/thrust/detail/merge.inl b/thrust/detail/merge.inl
index d42475709..1595cc1a1 100644
--- a/thrust/detail/merge.inl
+++ b/thrust/detail/merge.inl
@@ -14,9 +14,9 @@
  *  limitations under the License.
  */
 
-/*! \file merge.inl
- *  \brief Inline file for merge.h.
- */
+#pragma once
+
+#include <thrust/detail/config.h>
 
 #include <thrust/merge.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -24,9 +24,7 @@
 #include <thrust/system/detail/generic/merge.h>
 #include <thrust/system/detail/adl/merge.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy,
@@ -220,6 +218,4 @@ template<typename InputIterator1,
   return thrust::merge_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result);
 } // end merge_by_key()
 
-
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/minmax.h b/thrust/detail/minmax.h
index f59c64962..c565a74bd 100644
--- a/thrust/detail/minmax.h
+++ b/thrust/detail/minmax.h
@@ -18,9 +18,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 template<typename T, typename BinaryPredicate>
 __host__ __device__
@@ -50,6 +48,4 @@ __host__ __device__
   return lhs < rhs ? rhs : lhs;
 } // end max()
 
-
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/mismatch.inl b/thrust/detail/mismatch.inl
index 6c39aab86..16c579d80 100644
--- a/thrust/detail/mismatch.inl
+++ b/thrust/detail/mismatch.inl
@@ -14,11 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file mismatch.inl
- *  \brief Inline file for mismatch.h
- */
-
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/mismatch.h>
@@ -27,9 +23,7 @@
 #include <thrust/system/detail/generic/mismatch.h>
 #include <thrust/system/detail/adl/mismatch.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2>
@@ -92,6 +86,4 @@ thrust::pair<InputIterator1, InputIterator2> mismatch(InputIterator1 first1,
   return thrust::mismatch(select_system(system1,system2), first1, last1, first2, pred);
 } // end mismatch()
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/mpl/math.h b/thrust/detail/mpl/math.h
index 5356c9c15..bda98003c 100644
--- a/thrust/detail/mpl/math.h
+++ b/thrust/detail/mpl/math.h
@@ -22,8 +22,9 @@
 
 #pragma once
 
-namespace thrust
-{
+#include <thrust/detail/config.h>
+
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -170,5 +171,5 @@ template<typename result_type, result_type x>
 
 } // end namespace detail
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/numeric_traits.h b/thrust/detail/numeric_traits.h
index 168b9ad0f..e728adcaf 100644
--- a/thrust/detail/numeric_traits.h
+++ b/thrust/detail/numeric_traits.h
@@ -16,13 +16,13 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
 #include <thrust/detail/type_traits.h>
 #include <limits>
 
 //#include <stdint.h> // for intmax_t (not provided on MSVS 2005)
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -126,5 +126,4 @@ numeric_distance(Number x, Number y)
 
 } // end detail
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/numeric_wrapper.h b/thrust/detail/numeric_wrapper.h
new file mode 100644
index 000000000..9ebc6e23b
--- /dev/null
+++ b/thrust/detail/numeric_wrapper.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright 2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+// When a compiler uses Thrust as part of its implementation of Standard C++
+// algorithms, a cycle of included files may result when Thrust code tries to
+// use a standard algorithm.  Having a macro that is defined only when Thrust
+// is including an algorithms-related header gives the compiler a chance to
+// detect and break the cycle of includes.
+
+#define THRUST_INCLUDING_ALGORITHMS_HEADER
+#include <numeric>
+#undef  THRUST_INCLUDING_ALGORITHMS_HEADER
diff --git a/thrust/detail/overlapped_copy.h b/thrust/detail/overlapped_copy.h
index f6bb85a91..418497de8 100644
--- a/thrust/detail/overlapped_copy.h
+++ b/thrust/detail/overlapped_copy.h
@@ -23,8 +23,8 @@
 #include <thrust/detail/temporary_array.h>
 #include <thrust/system/cpp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
+
 namespace detail
 {
 
@@ -127,5 +127,5 @@ template<typename RandomAccessIterator1,
 } // end overlapped_copy()
 
 } // end detail
-} // end thrust
 
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/pair.inl b/thrust/detail/pair.inl
index 426668b99..4b7dd6eb0 100644
--- a/thrust/detail/pair.inl
+++ b/thrust/detail/pair.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,11 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/pair.h>
 #include <thrust/detail/swap.h>
+#include <thrust/tuple.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 template <typename T1, typename T2>
   __host__ __device__
@@ -140,13 +144,13 @@ template <typename T1, typename T2>
 
 // specializations of tuple_element for pair
 template<typename T1, typename T2>
-  struct tuple_element<0, pair<T1,T2> >
+  struct tuple_element<0, pair<T1,T2>>
 {
   typedef T1 type;
 }; // end tuple_element
 
 template<typename T1, typename T2>
-  struct tuple_element<1, pair<T1,T2> >
+  struct tuple_element<1, pair<T1,T2>>
 {
   typedef T2 type;
 }; // end tuple_element
@@ -154,7 +158,7 @@ template<typename T1, typename T2>
 
 // specialization of tuple_size for pair
 template<typename T1, typename T2>
-  struct tuple_size< pair<T1,T2 > >
+  struct tuple_size<pair<T1,T2>>
 {
   static const unsigned int value = 2;
 }; // end tuple_size
@@ -224,6 +228,4 @@ template<unsigned int N, typename T1, typename T2>
   return detail::pair_get<N, pair<T1,T2> >()(p);
 } // end get()
 
-
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/partition.inl b/thrust/detail/partition.inl
index a667264c6..5c51bca80 100644
--- a/thrust/detail/partition.inl
+++ b/thrust/detail/partition.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file partition.inl
- *  \brief Inline file for partition.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/partition.h>
@@ -26,9 +23,7 @@
 #include <thrust/system/detail/generic/partition.h>
 #include <thrust/system/detail/adl/partition.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy,
@@ -413,6 +408,4 @@ template<typename InputIterator, typename Predicate>
   return thrust::is_partitioned(select_system(system), first, last, pred);
 } // end is_partitioned()
 
-
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/pointer.h b/thrust/detail/pointer.h
index baacac7fa..aed1fcc24 100644
--- a/thrust/detail/pointer.h
+++ b/thrust/detail/pointer.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2018 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,53 +14,60 @@
  *  limitations under the License.
  */
 
+/*! \file
+ *  \brief A pointer to a variable which resides in memory associated with a
+ *  system.
+ */
+
 #pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/iterator/iterator_adaptor.h>
 #include <thrust/iterator/detail/iterator_traversal_tags.h>
+#include <thrust/type_traits/remove_cvref.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/reference_forward_declaration.h>
 #include <ostream>
+#include <cstddef>
 
+THRUST_NAMESPACE_BEGIN
 
-namespace thrust
-{
+template <typename Element, typename Tag, typename Reference = use_default, typename Derived = use_default>
+class pointer;
 
-// declare pointer with default values of template parameters
-template<typename Element, typename Tag, typename Reference = use_default, typename Derived = use_default> class pointer;
-
-} // end thrust
+// Specialize `thrust::iterator_traits` to avoid problems with the name of
+// pointer's constructor shadowing its nested pointer type. We do this before
+// pointer is defined so the specialization is correctly used inside the
+// definition.
+template <typename Element, typename Tag, typename Reference, typename Derived>
+struct iterator_traits<thrust::pointer<Element, Tag, Reference, Derived>>
+{
+  using pointer           = thrust::pointer<Element, Tag, Reference, Derived>;
+  using iterator_category = typename pointer::iterator_category;
+  using value_type        = typename pointer::value_type;
+  using difference_type   = typename pointer::difference_type;
+  using reference         = typename pointer::reference;
+};
 
+THRUST_NAMESPACE_END
 
-// specialize thrust::iterator_traits to avoid problems with the name of
-// pointer's constructor shadowing its nested pointer type
-// do this before pointer is defined so the specialization is correctly
-// used inside the definition
-namespace thrust
+namespace std
 {
 
-template<typename Element, typename Tag, typename Reference, typename Derived>
-  struct iterator_traits<thrust::pointer<Element,Tag,Reference,Derived> >
+template <typename Element, typename Tag, typename Reference, typename Derived>
+struct iterator_traits<THRUST_NS_QUALIFIER::pointer<Element, Tag, Reference, Derived>>
 {
-  private:
-    typedef thrust::pointer<Element,Tag,Reference,Derived> ptr;
+  using pointer           = THRUST_NS_QUALIFIER::pointer<Element, Tag, Reference, Derived>;
+  using iterator_category = typename pointer::iterator_category;
+  using value_type        = typename pointer::value_type;
+  using difference_type   = typename pointer::difference_type;
+  using reference         = typename pointer::reference;
+};
 
-  public:
-    typedef typename ptr::iterator_category iterator_category;
-    typedef typename ptr::value_type        value_type;
-    typedef typename ptr::difference_type   difference_type;
-    // XXX implement this type (the result of operator->) later
-    typedef void                             pointer;
-    typedef typename ptr::reference         reference;
-}; // end iterator_traits
-
-} // end thrust
+} // namespace std
 
-
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -72,7 +79,7 @@ template<typename Element, typename Tag, typename Reference, typename Derived>
   // void pointers should have no element type
   // note that we remove_cv from the Element type to get the value_type
   typedef typename thrust::detail::eval_if<
-    thrust::detail::is_void<typename thrust::detail::remove_const<Element>::type>::value,
+    thrust::detail::is_void<typename thrust::remove_cvref<Element>::type>::value,
     thrust::detail::identity_<void>,
     thrust::detail::remove_cv<Element>
   >::type value_type;
@@ -87,14 +94,14 @@ template<typename Element, typename Tag, typename Reference, typename Derived>
   // void pointers should have no reference type
   // if no Reference type is given, just use reference
   typedef typename thrust::detail::eval_if<
-    thrust::detail::is_void<typename thrust::detail::remove_const<Element>::type>::value,
+    thrust::detail::is_void<typename thrust::remove_cvref<Element>::type>::value,
     thrust::detail::identity_<void>,
     thrust::detail::eval_if<
       thrust::detail::is_same<Reference,use_default>::value,
       thrust::detail::identity_<reference<Element,derived_type> >,
       thrust::detail::identity_<Reference>
     >
-  >::type reference_arg;
+  >::type reference_type;
 
   typedef thrust::iterator_adaptor<
     derived_type,                        // pass along the type of our Derived class to iterator_adaptor
@@ -102,7 +109,7 @@ template<typename Element, typename Tag, typename Reference, typename Derived>
     value_type,                          // the value type
     Tag,                                 // system tag
     thrust::random_access_traversal_tag, // pointers have random access traversal
-    reference_arg,                       // pass along our Reference type
+    reference_type,                      // pass along our Reference type
     std::ptrdiff_t
   > type;
 }; // end pointer_base
@@ -146,12 +153,10 @@ template<typename Element, typename Tag, typename Reference, typename Derived>
     __host__ __device__
     pointer();
 
-    #if THRUST_CPP_DIALECT >= 2011
     // NOTE: This is needed so that Thrust smart pointers can be used in
     // `std::unique_ptr`.
     __host__ __device__
-    pointer(decltype(nullptr));
-    #endif
+    pointer(std::nullptr_t);
 
     // OtherValue shall be convertible to Value
     // XXX consider making the pointer implementation a template parameter which defaults to Element *
@@ -182,12 +187,10 @@ template<typename Element, typename Tag, typename Reference, typename Derived>
 
     // assignment
 
-    #if THRUST_CPP_DIALECT >= 2011
     // NOTE: This is needed so that Thrust smart pointers can be used in
     // `std::unique_ptr`.
     __host__ __device__
-    derived_type& operator=(decltype(nullptr));
-    #endif
+    derived_type& operator=(std::nullptr_t);
 
     // OtherPointer's element_type shall be convertible to Element
     // OtherPointer's system shall be convertible to Tag
@@ -205,12 +208,19 @@ template<typename Element, typename Tag, typename Reference, typename Derived>
     __host__ __device__
     Element *get() const;
 
-    #if THRUST_CPP_DIALECT >= 2011
+    __host__ __device__
+    Element *operator->() const;
+
     // NOTE: This is needed so that Thrust smart pointers can be used in
     // `std::unique_ptr`.
     __host__ __device__
     explicit operator bool() const;
-    #endif
+
+    __host__ __device__
+    static derived_type pointer_to(typename thrust::detail::pointer_traits_detail::pointer_to_param<Element>::type r)
+    {
+      return thrust::detail::pointer_traits<derived_type>::pointer_to(r);
+    }
 }; // end pointer
 
 // Output stream operator
@@ -221,27 +231,25 @@ std::basic_ostream<charT, traits> &
 operator<<(std::basic_ostream<charT, traits> &os,
            const pointer<Element, Tag, Reference, Derived> &p);
 
-#if THRUST_CPP_DIALECT >= 2011
 // NOTE: This is needed so that Thrust smart pointers can be used in
 // `std::unique_ptr`.
 template <typename Element, typename Tag, typename Reference, typename Derived>
 __host__ __device__
-bool operator==(decltype(nullptr), pointer<Element, Tag, Reference, Derived> p);
+bool operator==(std::nullptr_t, pointer<Element, Tag, Reference, Derived> p);
 
 template <typename Element, typename Tag, typename Reference, typename Derived>
 __host__ __device__
-bool operator==(pointer<Element, Tag, Reference, Derived> p, decltype(nullptr));
+bool operator==(pointer<Element, Tag, Reference, Derived> p, std::nullptr_t);
 
 template <typename Element, typename Tag, typename Reference, typename Derived>
 __host__ __device__
-bool operator!=(decltype(nullptr), pointer<Element, Tag, Reference, Derived> p);
+bool operator!=(std::nullptr_t, pointer<Element, Tag, Reference, Derived> p);
 
 template <typename Element, typename Tag, typename Reference, typename Derived>
 __host__ __device__
-bool operator!=(pointer<Element, Tag, Reference, Derived> p, decltype(nullptr));
-#endif
+bool operator!=(pointer<Element, Tag, Reference, Derived> p, std::nullptr_t);
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/pointer.inl>
 
diff --git a/thrust/detail/pointer.inl b/thrust/detail/pointer.inl
index 66e7cdf36..de05ff20f 100644
--- a/thrust/detail/pointer.inl
+++ b/thrust/detail/pointer.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2018 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,36 +14,29 @@
  *  limitations under the License.
  */
 
-#include <thrust/detail/config.h>
-#include <thrust/detail/pointer.h>
+#pragma once
 
+#include <thrust/detail/config.h>
 
-namespace thrust
-{
+#include <thrust/detail/pointer.h>
+#include <thrust/detail/type_traits.h>
 
+THRUST_NAMESPACE_BEGIN
 
 template<typename Element, typename Tag, typename Reference, typename Derived>
   __host__ __device__
   pointer<Element,Tag,Reference,Derived>
     ::pointer()
-      : super_t(static_cast<Element*>(
-          #if THRUST_CPP_DIALECT >= 2011
-          nullptr
-          #else
-          0
-          #endif
-        ))
+      : super_t(static_cast<Element*>(nullptr))
 {} // end pointer::pointer
 
 
-#if THRUST_CPP_DIALECT >= 2011
 template<typename Element, typename Tag, typename Reference, typename Derived>
   __host__ __device__
   pointer<Element,Tag,Reference,Derived>
-    ::pointer(decltype(nullptr))
+    ::pointer(std::nullptr_t)
       : super_t(static_cast<Element*>(nullptr))
 {} // end pointer::pointer
-#endif
 
 
 template<typename Element, typename Tag, typename Reference, typename Derived>
@@ -81,7 +74,6 @@ template<typename Element, typename Tag, typename Reference, typename Derived>
 {} // end pointer::pointer
 
 
-#if THRUST_CPP_DIALECT >= 2011
 template<typename Element, typename Tag, typename Reference, typename Derived>
   __host__ __device__
   typename pointer<Element,Tag,Reference,Derived>::derived_type &
@@ -91,7 +83,6 @@ template<typename Element, typename Tag, typename Reference, typename Derived>
   super_t::base_reference() = nullptr;
   return static_cast<derived_type&>(*this);
 } // end pointer::operator=
-#endif
 
 
 template<typename Element, typename Tag, typename Reference, typename Derived>
@@ -109,14 +100,43 @@ template<typename Element, typename Tag, typename Reference, typename Derived>
   return static_cast<derived_type&>(*this);
 } // end pointer::operator=
 
+namespace detail
+{
+
+// Implementation for dereference() when Reference is Element&,
+// e.g. cuda's managed_memory_pointer
+template <typename Reference, typename Derived>
+__host__ __device__
+Reference pointer_dereference_impl(const Derived& ptr,
+                                   thrust::detail::true_type /* is_cpp_ref */)
+{
+  return *ptr.get();
+}
+
+// Implementation for pointers with proxy references:
+template <typename Reference, typename Derived>
+__host__ __device__
+Reference pointer_dereference_impl(const Derived& ptr,
+                                   thrust::detail::false_type /* is_cpp_ref */)
+{
+  return Reference(ptr);
+}
+
+} // namespace detail
 
 template<typename Element, typename Tag, typename Reference, typename Derived>
   __host__ __device__
   typename pointer<Element,Tag,Reference,Derived>::super_t::reference
-    pointer<Element,Tag,Reference,Derived>
-      ::dereference() const
+  pointer<Element,Tag,Reference,Derived>
+    ::dereference() const
 {
-  return typename super_t::reference(static_cast<const derived_type&>(*this));
+  // Need to handle cpp refs and fancy refs differently:
+  typedef typename super_t::reference RefT;
+  typedef typename thrust::detail::is_reference<RefT>::type IsCppRef;
+
+  const derived_type& derivedPtr = static_cast<const derived_type&>(*this);
+
+  return detail::pointer_dereference_impl<RefT>(derivedPtr, IsCppRef());
 } // end pointer::dereference
 
 
@@ -129,7 +149,15 @@ template<typename Element, typename Tag, typename Reference, typename Derived>
 } // end pointer::get
 
 
-#if THRUST_CPP_DIALECT >= 2011
+template<typename Element, typename Tag, typename Reference, typename Derived>
+  __host__ __device__
+  Element *pointer<Element,Tag,Reference,Derived>
+    ::operator->() const
+{
+  return super_t::base();
+} // end pointer::operator->
+
+
 template<typename Element, typename Tag, typename Reference, typename Derived>
   __host__ __device__
   pointer<Element,Tag,Reference,Derived>
@@ -137,7 +165,6 @@ template<typename Element, typename Tag, typename Reference, typename Derived>
 {
   return bool(get());
 } // end pointer::operator bool
-#endif
 
 
 template<typename Element, typename Tag, typename Reference, typename Derived,
@@ -149,95 +176,34 @@ operator<<(std::basic_ostream<charT, traits> &os,
   return os << p.get();
 }
 
-#if THRUST_CPP_DIALECT >= 2011
 // NOTE: These are needed so that Thrust smart pointers work with
 // `std::unique_ptr`.
 template <typename Element, typename Tag, typename Reference, typename Derived>
 __host__ __device__
-bool operator==(decltype(nullptr), pointer<Element, Tag, Reference, Derived> p)
+bool operator==(std::nullptr_t, pointer<Element, Tag, Reference, Derived> p)
 {
   return nullptr == p.get();
 }
 
 template <typename Element, typename Tag, typename Reference, typename Derived>
 __host__ __device__
-bool operator==(pointer<Element, Tag, Reference, Derived> p, decltype(nullptr))
+bool operator==(pointer<Element, Tag, Reference, Derived> p, std::nullptr_t)
 {
   return nullptr == p.get();
 }
 
 template <typename Element, typename Tag, typename Reference, typename Derived>
 __host__ __device__
-bool operator!=(decltype(nullptr), pointer<Element, Tag, Reference, Derived> p)
+bool operator!=(std::nullptr_t, pointer<Element, Tag, Reference, Derived> p)
 {
   return !(nullptr == p);
 }
 
 template <typename Element, typename Tag, typename Reference, typename Derived>
 __host__ __device__
-bool operator!=(pointer<Element, Tag, Reference, Derived> p, decltype(nullptr))
+bool operator!=(pointer<Element, Tag, Reference, Derived> p, std::nullptr_t)
 {
   return !(nullptr == p);
 }
-#endif
-
-namespace detail
-{
-
-#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400)
-// XXX WAR MSVC 2005 problem with correctly implementing
-//     pointer_raw_pointer for pointer by specializing it here
-template<typename Element, typename Tag, typename Reference, typename Derived>
-  struct pointer_raw_pointer< thrust::pointer<Element,Tag,Reference,Derived> >
-{
-  typedef typename pointer<Element,Tag,Reference,Derived>::raw_pointer type;
-}; // end pointer_raw_pointer
-#endif
-
-
-#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) && (THRUST_GCC_VERSION < 40200)
-// XXX WAR g++-4.1 problem with correctly implementing
-//     pointer_element for pointer by specializing it here
-template<typename Element, typename Tag>
-  struct pointer_element< thrust::pointer<Element,Tag> >
-{
-  typedef Element type;
-}; // end pointer_element
-
-template<typename Element, typename Tag, typename Reference>
-  struct pointer_element< thrust::pointer<Element,Tag,Reference> >
-    : pointer_element< thrust::pointer<Element,Tag> >
-{}; // end pointer_element
-
-template<typename Element, typename Tag, typename Reference, typename Derived>
-  struct pointer_element< thrust::pointer<Element,Tag,Reference,Derived> >
-    : pointer_element< thrust::pointer<Element,Tag,Reference> >
-{}; // end pointer_element
-
-
-
-// XXX WAR g++-4.1 problem with correctly implementing
-//     rebind_pointer for pointer by specializing it here
-template<typename Element, typename Tag, typename NewElement>
-  struct rebind_pointer<thrust::pointer<Element,Tag>, NewElement>
-{
-  // XXX note we don't attempt to rebind the pointer's Reference type (or Derived)
-  typedef thrust::pointer<NewElement,Tag> type;
-};
-
-template<typename Element, typename Tag, typename Reference, typename NewElement>
-  struct rebind_pointer<thrust::pointer<Element,Tag,Reference>, NewElement>
-    : rebind_pointer<thrust::pointer<Element,Tag>, NewElement>
-{};
-
-template<typename Element, typename Tag, typename Reference, typename Derived, typename NewElement>
-  struct rebind_pointer<thrust::pointer<Element,Tag,Reference,Derived>, NewElement>
-    : rebind_pointer<thrust::pointer<Element,Tag,Reference>, NewElement>
-{};
-#endif
-
-} // end namespace detail
-
-
-} // end thrust
 
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/preprocessor.h b/thrust/detail/preprocessor.h
index 0e9943b76..2e850c764 100644
--- a/thrust/detail/preprocessor.h
+++ b/thrust/detail/preprocessor.h
@@ -947,7 +947,7 @@
   #define THRUST_PP_IIF_IMPL1(id) id
 #else
   #define THRUST_PP_IIF_IMPL0(bit, t, f)                                      \
-    THRUST_PP_CAT2(THRUST_PP_IIF_IMPL_TAG, bit(t, f))                         
+    THRUST_PP_CAT2(THRUST_PP_IIF_IMPL_TAG, bit(t, f))
     /**/
 #endif
 
@@ -1103,8 +1103,8 @@
   /**/
 
 /// \def THRUST_PP_DISPATCH(basename, ...)
-/// \brief Expands to <code>basenameN(...)</code>, where <code>N</code> is the
-///        number of variadic arguments that \a THRUST_PP_DISPATCH was called 
+/// \brief Expands to <tt>basenameN(...)</tt>, where <tt>N</tt> is the
+///        number of variadic arguments that \a THRUST_PP_DISPATCH was called
 ///        with. This macro can be used to implement "macro overloading".
 ///
 /// \par <b>Example</b>:
diff --git a/thrust/detail/range/head_flags.h b/thrust/detail/range/head_flags.h
index b193651cf..b755840c9 100644
--- a/thrust/detail/range/head_flags.h
+++ b/thrust/detail/range/head_flags.h
@@ -24,8 +24,7 @@
 #include <thrust/functional.h>
 
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -226,5 +225,5 @@ head_flags<RandomAccessIterator>
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/range/tail_flags.h b/thrust/detail/range/tail_flags.h
index 32ccb53c6..41ee5dd29 100644
--- a/thrust/detail/range/tail_flags.h
+++ b/thrust/detail/range/tail_flags.h
@@ -23,8 +23,7 @@
 #include <thrust/tuple.h>
 #include <thrust/functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -130,5 +129,5 @@ tail_flags<RandomAccessIterator>
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/raw_pointer_cast.h b/thrust/detail/raw_pointer_cast.h
index 33f87849d..53a77861e 100644
--- a/thrust/detail/raw_pointer_cast.h
+++ b/thrust/detail/raw_pointer_cast.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 template<typename Pointer>
 __host__ __device__
@@ -48,5 +47,4 @@ static_pointer_cast(FromPointer ptr)
   return ToPointer(static_cast<to_element*>(thrust::raw_pointer_cast(ptr)));
 }
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/raw_reference_cast.h b/thrust/detail/raw_reference_cast.h
index a678144e2..eff45f0c2 100644
--- a/thrust/detail/raw_reference_cast.h
+++ b/thrust/detail/raw_reference_cast.h
@@ -29,8 +29,7 @@
 // raw_reference_cast depends on metafunctions such as is_unwrappable and raw_reference
 // we need to be sure that these metafunctions are completely defined (including specializations) before they are instantiated by raw_reference_cast
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -48,26 +47,12 @@ template<typename T>
 
 // specialize is_unwrappable
 // a tuple is_unwrappable if any of its elements is_unwrappable
-template<
-  typename T0, typename T1, typename T2,
-  typename T3, typename T4, typename T5,
-  typename T6, typename T7, typename T8,
-  typename T9
->
+template<typename... Ts>
   struct is_unwrappable<
-    thrust::tuple<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+    thrust::tuple<Ts...>
   >
     : or_<
-        is_unwrappable<T0>,
-        is_unwrappable<T1>,
-        is_unwrappable<T2>,
-        is_unwrappable<T3>,
-        is_unwrappable<T4>,
-        is_unwrappable<T5>,
-        is_unwrappable<T6>,
-        is_unwrappable<T7>,
-        is_unwrappable<T8>,
-        is_unwrappable<T9>
+        is_unwrappable<Ts>...
       >
 {};
 
@@ -75,25 +60,13 @@ template<
 // specialize is_unwrappable
 // a tuple_of_iterator_references is_unwrappable if any of its elements is_unwrappable
 template<
-  typename T0, typename T1, typename T2,
-  typename T3, typename T4, typename T5,
-  typename T6, typename T7, typename T8,
-  typename T9
+  typename... Ts
 >
   struct is_unwrappable<
-    thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+    thrust::detail::tuple_of_iterator_references<Ts...>
   >
     : or_<
-        is_unwrappable<T0>,
-        is_unwrappable<T1>,
-        is_unwrappable<T2>,
-        is_unwrappable<T3>,
-        is_unwrappable<T4>,
-        is_unwrappable<T5>,
-        is_unwrappable<T6>,
-        is_unwrappable<T7>,
-        is_unwrappable<T8>,
-        is_unwrappable<T9>
+        is_unwrappable<Ts>...
       >
 {};
 
@@ -137,7 +110,7 @@ template<typename T>
 
 
 template<typename T>
-  struct raw_reference : 
+  struct raw_reference :
     raw_reference_detail::raw_reference_impl<T>
 {};
 
@@ -173,51 +146,27 @@ template<typename T>
 
 // recurse on tuples
 template <
-  typename T0, typename T1, typename T2,
-  typename T3, typename T4, typename T5,
-  typename T6, typename T7, typename T8,
-  typename T9
+  typename... Ts
 >
   struct raw_reference_tuple_helper<
-    thrust::tuple<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+    thrust::tuple<Ts...>
   >
 {
   typedef thrust::tuple<
-    typename raw_reference_tuple_helper<T0>::type,
-    typename raw_reference_tuple_helper<T1>::type,
-    typename raw_reference_tuple_helper<T2>::type,
-    typename raw_reference_tuple_helper<T3>::type,
-    typename raw_reference_tuple_helper<T4>::type,
-    typename raw_reference_tuple_helper<T5>::type,
-    typename raw_reference_tuple_helper<T6>::type,
-    typename raw_reference_tuple_helper<T7>::type,
-    typename raw_reference_tuple_helper<T8>::type,
-    typename raw_reference_tuple_helper<T9>::type
+    typename raw_reference_tuple_helper<Ts>::type...
   > type;
 };
 
 
 template <
-  typename T0, typename T1, typename T2,
-  typename T3, typename T4, typename T5,
-  typename T6, typename T7, typename T8,
-  typename T9
+  typename... Ts
 >
   struct raw_reference_tuple_helper<
-    thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+    thrust::detail::tuple_of_iterator_references<Ts...>
   >
 {
   typedef thrust::detail::tuple_of_iterator_references<
-    typename raw_reference_tuple_helper<T0>::type,
-    typename raw_reference_tuple_helper<T1>::type,
-    typename raw_reference_tuple_helper<T2>::type,
-    typename raw_reference_tuple_helper<T3>::type,
-    typename raw_reference_tuple_helper<T4>::type,
-    typename raw_reference_tuple_helper<T5>::type,
-    typename raw_reference_tuple_helper<T6>::type,
-    typename raw_reference_tuple_helper<T7>::type,
-    typename raw_reference_tuple_helper<T8>::type,
-    typename raw_reference_tuple_helper<T9>::type
+    typename raw_reference_tuple_helper<Ts>::type...
   > type;
 };
 
@@ -232,17 +181,14 @@ template <
 //   then the raw_reference of tuple_type is a tuple of its members' raw_references
 //   else the raw_reference of tuple_type is tuple_type &
 template <
-  typename T0, typename T1, typename T2,
-  typename T3, typename T4, typename T5,
-  typename T6, typename T7, typename T8,
-  typename T9
+  typename... Ts
 >
   struct raw_reference<
-    thrust::tuple<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+    thrust::tuple<Ts...>
   >
 {
   private:
-    typedef thrust::tuple<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> tuple_type;
+    typedef thrust::tuple<Ts...> tuple_type;
 
   public:
     typedef typename eval_if<
@@ -254,17 +200,14 @@ template <
 
 
 template <
-  typename T0, typename T1, typename T2,
-  typename T3, typename T4, typename T5,
-  typename T6, typename T7, typename T8,
-  typename T9
+  typename... Ts
 >
   struct raw_reference<
-    thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+    thrust::detail::tuple_of_iterator_references<Ts...>
   >
 {
   private:
-    typedef detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> tuple_type;
+    typedef detail::tuple_of_iterator_references<Ts...> tuple_type;
 
   public:
     typedef typename raw_reference_detail::raw_reference_tuple_helper<tuple_type>::type type;
@@ -295,19 +238,16 @@ typename detail::raw_reference<const T>::type
 
 
 template<
-  typename T0, typename T1, typename T2,
-  typename T3, typename T4, typename T5,
-  typename T6, typename T7, typename T8,
-  typename T9
+  typename... Ts
 >
 __host__ __device__
 typename detail::enable_if_unwrappable<
-  thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>,
+  thrust::detail::tuple_of_iterator_references<Ts...>,
   typename detail::raw_reference<
-    thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+    thrust::detail::tuple_of_iterator_references<Ts...>
   >::type
 >::type
-raw_reference_cast(thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> t);
+raw_reference_cast(thrust::detail::tuple_of_iterator_references<Ts...> t);
 
 
 namespace detail
@@ -331,18 +271,15 @@ struct raw_reference_caster
   }
 
   template<
-    typename T0, typename T1, typename T2,
-    typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8,
-    typename T9
+    typename... Ts
   >
   __host__ __device__
   typename detail::raw_reference<
-    thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+    thrust::detail::tuple_of_iterator_references<Ts...>
   >::type
-  operator()(thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> t,
+  operator()(thrust::detail::tuple_of_iterator_references<Ts...> t,
              typename enable_if<
-               is_unwrappable<thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> >::value
+               is_unwrappable<thrust::detail::tuple_of_iterator_references<Ts...> >::value
              >::type * = 0)
   {
     return thrust::raw_reference_cast(t);
@@ -372,19 +309,16 @@ typename detail::raw_reference<const T>::type
 
 
 template<
-  typename T0, typename T1, typename T2,
-  typename T3, typename T4, typename T5,
-  typename T6, typename T7, typename T8,
-  typename T9
+  typename... Ts
 >
 __host__ __device__
 typename detail::enable_if_unwrappable<
-  thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>,
+  thrust::detail::tuple_of_iterator_references<Ts...>,
   typename detail::raw_reference<
-    thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+    thrust::detail::tuple_of_iterator_references<Ts...>
   >::type
 >::type
-raw_reference_cast(thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> t)
+raw_reference_cast(thrust::detail::tuple_of_iterator_references<Ts...> t)
 {
   thrust::detail::raw_reference_caster f;
 
@@ -394,5 +328,5 @@ raw_reference_cast(thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T
 } // end raw_reference_cast
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/reduce.inl b/thrust/detail/reduce.inl
index 2ecedc7a2..448a4b38c 100644
--- a/thrust/detail/reduce.inl
+++ b/thrust/detail/reduce.inl
@@ -14,10 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file reduce.inl
- *  \brief Inline file for reduce.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/reduce.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -27,8 +26,7 @@
 #include <thrust/system/detail/adl/reduce.h>
 #include <thrust/system/detail/adl/reduce_by_key.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -81,7 +79,7 @@ template<typename DerivedPolicy,
 __host__ __device__
   thrust::pair<OutputIterator1,OutputIterator2>
   reduce_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                InputIterator1 keys_first, 
+                InputIterator1 keys_first,
                 InputIterator1 keys_last,
                 InputIterator2 values_first,
                 OutputIterator1 keys_output,
@@ -102,7 +100,7 @@ template<typename DerivedPolicy,
 __host__ __device__
   thrust::pair<OutputIterator1,OutputIterator2>
   reduce_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                InputIterator1 keys_first, 
+                InputIterator1 keys_first,
                 InputIterator1 keys_last,
                 InputIterator2 values_first,
                 OutputIterator1 keys_output,
@@ -125,7 +123,7 @@ template<typename DerivedPolicy,
 __host__ __device__
   thrust::pair<OutputIterator1,OutputIterator2>
   reduce_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                InputIterator1 keys_first, 
+                InputIterator1 keys_first,
                 InputIterator1 keys_last,
                 InputIterator2 values_first,
                 OutputIterator1 keys_output,
@@ -192,7 +190,7 @@ template<typename InputIterator1,
          typename OutputIterator1,
          typename OutputIterator2>
   thrust::pair<OutputIterator1,OutputIterator2>
-  reduce_by_key(InputIterator1 keys_first, 
+  reduce_by_key(InputIterator1 keys_first,
                 InputIterator1 keys_last,
                 InputIterator2 values_first,
                 OutputIterator1 keys_output,
@@ -220,7 +218,7 @@ template<typename InputIterator1,
          typename OutputIterator2,
          typename BinaryPredicate>
   thrust::pair<OutputIterator1,OutputIterator2>
-  reduce_by_key(InputIterator1 keys_first, 
+  reduce_by_key(InputIterator1 keys_first,
                 InputIterator1 keys_last,
                 InputIterator2 values_first,
                 OutputIterator1 keys_output,
@@ -250,7 +248,7 @@ template<typename InputIterator1,
          typename BinaryPredicate,
          typename BinaryFunction>
   thrust::pair<OutputIterator1,OutputIterator2>
-  reduce_by_key(InputIterator1 keys_first, 
+  reduce_by_key(InputIterator1 keys_first,
                 InputIterator1 keys_last,
                 InputIterator2 values_first,
                 OutputIterator1 keys_output,
@@ -274,5 +272,5 @@ template<typename InputIterator1,
 }
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/reference.h b/thrust/detail/reference.h
index 5f492eec1..5cc13625d 100644
--- a/thrust/detail/reference.h
+++ b/thrust/detail/reference.h
@@ -14,161 +14,505 @@
  *  limitations under the License.
  */
 
+/*! \file 
+ *  \brief A pointer to a variable which resides in memory associated with a
+ *  system.
+ */
+
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/type_traits.h>
-#include <thrust/detail/use_default.h>
 #include <thrust/detail/reference_forward_declaration.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/memory.h>
+#include <thrust/system/detail/adl/get_value.h>
+#include <thrust/system/detail/adl/assign_value.h>
+#include <thrust/system/detail/adl/iter_swap.h>
+#include <thrust/type_traits/remove_cvref.h>
+#include <type_traits>
 #include <ostream>
 
+THRUST_NAMESPACE_BEGIN
 
-namespace thrust
-{
 namespace detail
 {
-
-template<typename> struct is_wrapped_reference;
-
+template <typename>
+struct is_wrapped_reference;
 }
 
-// the base type for all of thrust's system-annotated references.
-// for reasonable reference-like semantics, derived types must reimplement the following:
-// 1. constructor from pointer
-// 2. copy constructor
-// 3. templated copy constructor from other reference
-// 4. templated assignment from other reference
-// 5. assignment from value_type
-template<typename Element, typename Pointer, typename Derived>
-  class reference
+/*! \p reference acts as a reference-like wrapper for an object residing in
+ *  memory that a \p pointer refers to.
+ */
+template <typename Element, typename Pointer, typename Derived>
+class reference
 {
-  private:
-    typedef typename thrust::detail::eval_if<
-      thrust::detail::is_same<Derived,use_default>::value,
-      thrust::detail::identity_<reference>,
-      thrust::detail::identity_<Derived>
-    >::type derived_type;
-
-    // hint for is_wrapped_reference lets it know that this type (or a derived type)
-    // is a wrapped reference
-    struct wrapped_reference_hint {};
-    template<typename> friend struct thrust::detail::is_wrapped_reference;
-
-  public:
-    typedef Pointer                                              pointer;
-    typedef typename thrust::detail::remove_const<Element>::type value_type;
-
-    __host__ __device__
-    explicit reference(const pointer &ptr);
-
-    template<typename OtherElement, typename OtherPointer, typename OtherDerived>
-    __host__ __device__
-    reference(const reference<OtherElement,OtherPointer,OtherDerived> &other,
-              typename thrust::detail::enable_if_convertible<
-                typename reference<OtherElement,OtherPointer,OtherDerived>::pointer,
-                pointer
-              >::type * = 0);
-
-    __host__ __device__
-    derived_type &operator=(const reference &other);
-
-    // XXX this may need an enable_if
-    template<typename OtherElement, typename OtherPointer, typename OtherDerived>
-    __host__ __device__
-    derived_type &operator=(const reference<OtherElement,OtherPointer,OtherDerived> &other);
-
-    __host__ __device__
-    derived_type &operator=(const value_type &x);
-
-    __host__ __device__
-    pointer operator&() const;
-
-    __host__ __device__
-    operator value_type () const;
-
-    __host__ __device__
-    void swap(derived_type &other);
-
-    derived_type &operator++();
-
-    value_type operator++(int);
-
-    // XXX parameterize the type of rhs
-    derived_type &operator+=(const value_type &rhs);
-
-    derived_type &operator--();
-
-    value_type operator--(int);
-
-    // XXX parameterize the type of rhs
-    derived_type &operator-=(const value_type &rhs);
-
-    // XXX parameterize the type of rhs
-    derived_type &operator*=(const value_type &rhs);
-
-    // XXX parameterize the type of rhs
-    derived_type &operator/=(const value_type &rhs);
-
-    // XXX parameterize the type of rhs
-    derived_type &operator%=(const value_type &rhs);
-
-    // XXX parameterize the type of rhs
-    derived_type &operator<<=(const value_type &rhs);
-
-    // XXX parameterize the type of rhs
-    derived_type &operator>>=(const value_type &rhs);
-
-    // XXX parameterize the type of rhs
-    derived_type &operator&=(const value_type &rhs);
-
-    // XXX parameterize the type of rhs
-    derived_type &operator|=(const value_type &rhs);
-
-    // XXX parameterize the type of rhs
-    derived_type &operator^=(const value_type &rhs);
-
-  private:
-    const pointer m_ptr;
-
-    // allow access to m_ptr for other references
-    template <typename OtherElement, typename OtherPointer, typename OtherDerived> friend class reference;
-
-    template<typename System>
-    __host__ __device__
-    inline value_type strip_const_get_value(const System &system) const;
-
-    template<typename OtherPointer>
-    __host__ __device__
-    inline void assign_from(OtherPointer src);
-
-    // XXX this helper exists only to avoid warnings about null references from the other assign_from
-    template<typename System1, typename System2, typename OtherPointer>
-    inline __host__ __device__
-    void assign_from(System1 *system1, System2 *system2, OtherPointer src);
-
-    template<typename System, typename OtherPointer>
-    __host__ __device__
-    inline void strip_const_assign_value(const System &system, OtherPointer src);
-
-    // XXX this helper exists only to avoid warnings about null references from the other swap
-    template<typename System>
-    inline __host__ __device__
-    void swap(System *system, derived_type &other);
-
-    // XXX this helper exists only to avoid warnings about null references from operator value_type ()
-    template<typename System>
-    inline __host__ __device__
-    value_type convert_to_value_type(System *system) const;
-}; // end reference
+private:
+  using derived_type = typename std::conditional<
+    std::is_same<Derived, use_default>::value, reference, Derived
+  >::type;
+
+public:
+  using pointer    = Pointer;
+  using value_type = typename thrust::remove_cvref<Element>::type;
+
+  reference(reference const&) = default;
+
+  reference(reference&&) = default;
+
+  /*! Construct a \p reference from another \p reference whose pointer type is
+   *  convertible to \p pointer. After this \p reference is constructed, it
+   *  shall refer to the same object as \p other.
+   *
+   *  \tparam OtherElement The element type of the other \p reference.
+   *  \tparam OtherPointer The pointer type of the other \p reference.
+   *  \tparam OtherDerived The derived type of the other \p reference.
+   *  \param  other        A \p reference to copy from.
+   */
+  template <typename OtherElement, typename OtherPointer, typename OtherDerived>
+  __host__ __device__
+  reference(
+    reference<OtherElement, OtherPointer, OtherDerived> const& other
+  /*! \cond
+   */
+  , typename std::enable_if<
+      std::is_convertible<
+        typename reference<OtherElement, OtherPointer, OtherDerived>::pointer
+      , pointer
+      >::value
+    >::type* = nullptr
+  /*! \endcond
+   */
+  )
+    : ptr(other.ptr)
+  {}
+
+  /*! Construct a \p reference that refers to an object pointed to by the given
+   *  \p pointer. After this \p reference is constructed, it shall refer to the
+   *  object pointed to by \p ptr.
+   *
+   *  \param ptr A \p pointer to construct from.
+   */
+  __host__ __device__
+  explicit reference(pointer const& p) : ptr(p) {}
+
+  /*! Assign the object referred to \p other to the object referred to by
+   *  this \p reference.
+   *
+   *  \param other The other \p reference to assign from.
+   *
+   *  \return <tt>*this</tt>.
+   */
+  __host__ __device__
+  derived_type& operator=(reference const& other)
+  {
+    assign_from(&other);
+    return derived();
+  }
+
+  /*! Assign the object referred to by this \p reference with the object
+   *  referred to by another \p reference whose pointer type is convertible to
+   *  \p pointer.
+   *
+   *  \tparam OtherElement The element type of the other \p reference.
+   *  \tparam OtherPointer The pointer type of the other \p reference.
+   *  \tparam OtherDerived The derived type of the other \p reference.
+   *  \param  other        The other \p reference to assign from.
+   *
+   *  \return <tt>*this</tt>.
+   */
+  template <typename OtherElement, typename OtherPointer, typename OtherDerived>
+  __host__ __device__
+  /*! \cond
+   */
+  typename std::enable_if<
+    std::is_convertible<
+      typename reference<OtherElement, OtherPointer, OtherDerived>::pointer
+    , pointer
+    >::value,
+  /*! \endcond
+   */
+    derived_type&
+  /*! \cond
+   */
+  >::type
+  /*! \endcond
+   */
+  operator=(reference<OtherElement, OtherPointer, OtherDerived> const& other)
+  {
+    assign_from(&other);
+    return derived();
+  }
+
+  /*! Assign \p rhs to the object referred to by this \p tagged_reference.
+   *
+   *  \param rhs The \p value_type to assign from.
+   *
+   *  \return <tt>*this</tt>.
+   */
+  __host__ __device__
+  derived_type& operator=(value_type const& rhs)
+  {
+    assign_from(&rhs);
+    return derived();
+  }
+
+  /*! Exchanges the value of the object referred to by this \p tagged_reference
+   *  with the object referred to by \p other.
+   *
+   *  \param other The \p tagged_reference to swap with.
+   */
+  __host__ __device__
+  void swap(derived_type& other)
+  {
+    // Avoid default-constructing a system; instead, just use a null pointer
+    // for dispatch. This assumes that `get_value` will not access any system
+    // state.
+    typename thrust::iterator_system<pointer>::type* system = nullptr;
+    swap(system, other);
+  }
+
+  __host__ __device__ pointer operator&() const { return ptr; }
+
+  // This is inherently hazardous, as it discards the strong type information
+  // about what system the object is on.
+  __host__ __device__ operator value_type() const
+  {
+    // Avoid default-constructing a system; instead, just use a null pointer
+    // for dispatch. This assumes that `get_value` will not access any system
+    // state.
+    typename thrust::iterator_system<pointer>::type* system = nullptr;
+    return convert_to_value_type(system);
+  }
+
+  __host__ __device__
+  derived_type& operator++()
+  {
+    // Sadly, this has to make a copy. The only mechanism we have for
+    // modifying the value, which may be in memory inaccessible to this
+    // system, is to get a copy of it, modify the copy, and then update it.
+    value_type tmp = *this;
+    ++tmp;
+    *this = tmp;
+    return derived();
+  }
+
+  __host__ __device__
+  value_type operator++(int)
+  {
+    value_type tmp = *this;
+    value_type result = tmp++;
+    *this = std::move(tmp);
+    return result;
+  }
+
+  derived_type& operator--()
+  {
+    // Sadly, this has to make a copy. The only mechanism we have for
+    // modifying the value, which may be in memory inaccessible to this
+    // system, is to get a copy of it, modify the copy, and then update it.
+    value_type tmp = *this;
+    --tmp;
+    *this = std::move(tmp);
+    return derived();
+  }
+
+  value_type operator--(int)
+  {
+    value_type tmp = *this;
+    value_type result = tmp--;
+    *this = std::move(tmp);
+    return derived();
+  }
+
+  __host__ __device__
+  derived_type& operator+=(value_type const& rhs)
+  {
+    value_type tmp = *this;
+    tmp += rhs;
+    *this = tmp;
+    return derived();
+  }
+
+  derived_type& operator-=(value_type const& rhs)
+  {
+    value_type tmp = *this;
+    tmp -= rhs;
+    *this = tmp;
+    return derived();
+  }
+
+  derived_type& operator*=(value_type const& rhs)
+  {
+    value_type tmp = *this;
+    tmp *= rhs;
+    *this = tmp;
+    return derived();
+  }
+
+  derived_type& operator/=(value_type const& rhs)
+  {
+    value_type tmp = *this;
+    tmp /= rhs;
+    *this = tmp;
+    return derived();
+  }
+
+  derived_type& operator%=(value_type const& rhs)
+  {
+    value_type tmp = *this;
+    tmp %= rhs;
+    *this = tmp;
+    return derived();
+  }
+
+  derived_type& operator<<=(value_type const& rhs)
+  {
+    value_type tmp = *this;
+    tmp <<= rhs;
+    *this = tmp;
+    return derived();
+  }
+
+  derived_type& operator>>=(value_type const& rhs)
+  {
+    value_type tmp = *this;
+    tmp >>= rhs;
+    *this = tmp;
+    return derived();
+  }
+
+  derived_type& operator&=(value_type const& rhs)
+  {
+    value_type tmp = *this;
+    tmp &= rhs;
+    *this = tmp;
+    return derived();
+  }
+
+  derived_type& operator|=(value_type const& rhs)
+  {
+    value_type tmp = *this;
+    tmp |= rhs;
+    *this = tmp;
+    return derived();
+  }
+
+  derived_type& operator^=(value_type const& rhs)
+  {
+    value_type tmp = *this;
+    tmp ^= rhs;
+    *this = tmp;
+    return derived();
+  }
+
+private:
+  pointer const ptr;
+
+  // `thrust::detail::is_wrapped_reference` is a trait that indicates whether
+  // a type is a fancy reference. It detects such types by loooking for a
+  // nested `wrapped_reference_hint` type.
+  struct wrapped_reference_hint {};
+  template <typename>
+  friend struct thrust::detail::is_wrapped_reference;
+
+  template <typename OtherElement, typename OtherPointer, typename OtherDerived>
+  friend class reference;
+
+  __host__ __device__
+  derived_type& derived() { return static_cast<derived_type&>(*this); }
+
+  template<typename System>
+  __host__ __device__
+  value_type convert_to_value_type(System* system) const
+  {
+    using thrust::system::detail::generic::select_system;
+    return strip_const_get_value(select_system(*system));
+  }
+
+  template <typename System>
+  __host__ __device__
+  value_type strip_const_get_value(System const& system) const
+  {
+    System &non_const_system = const_cast<System&>(system);
+
+    using thrust::system::detail::generic::get_value;
+    return get_value(thrust::detail::derived_cast(non_const_system), ptr);
+  }
+
+  template <typename System0, typename System1, typename OtherPointer>
+  __host__ __device__
+  void assign_from(System0* system0, System1* system1, OtherPointer src)
+  {
+    using thrust::system::detail::generic::select_system;
+    strip_const_assign_value(select_system(*system0, *system1), src);
+  }
+
+  template <typename OtherPointer>
+  __host__ __device__
+  void assign_from(OtherPointer src)
+  {
+    // Avoid default-constructing systems; instead, just use a null pointer
+    // for dispatch. This assumes that `get_value` will not access any system
+    // state.
+    typename thrust::iterator_system<pointer>::type*      system0 = nullptr;
+    typename thrust::iterator_system<OtherPointer>::type* system1 = nullptr;
+    assign_from(system0, system1, src);
+  }
+
+  template <typename System, typename OtherPointer>
+  __host__ __device__
+  void strip_const_assign_value(System const& system, OtherPointer src)
+  {
+    System& non_const_system = const_cast<System&>(system);
+
+    using thrust::system::detail::generic::assign_value;
+    assign_value(thrust::detail::derived_cast(non_const_system), ptr, src);
+  }
+
+  template <typename System>
+  __host__ __device__
+  void swap(System* system, derived_type& other)
+  {
+    using thrust::system::detail::generic::select_system;
+    using thrust::system::detail::generic::iter_swap;
+
+    iter_swap(select_system(*system, *system), ptr, other.ptr);
+  }
+};
+
+template <typename Pointer, typename Derived>
+class reference<void, Pointer, Derived> {};
+
+template <typename Pointer, typename Derived>
+class reference<void const, Pointer, Derived> {};
+
+template <
+  typename Element, typename Pointer, typename Derived
+, typename CharT, typename Traits
+>
+std::basic_ostream<CharT, Traits>& operator<<(
+  std::basic_ostream<CharT, Traits>&os
+, reference<Element, Pointer, Derived> const& r
+) {
+  using value_type = typename reference<Element, Pointer, Derived>::value_type;
+  return os << static_cast<value_type>(r);
+}
 
-// Output stream operator
-template<typename Element, typename Pointer, typename Derived,
-         typename charT, typename traits>
-std::basic_ostream<charT, traits> &
-operator<<(std::basic_ostream<charT, traits> &os,
-           const reference<Element, Pointer, Derived> &y);
+template <typename Element, typename Tag>
+class tagged_reference;
 
-} // end thrust
+/*! \p tagged_reference acts as a reference-like wrapper for an object residing
+ *  in memory associated with system \p Tag that a \p pointer refers to.
+ */
+template <typename Element, typename Tag>
+class tagged_reference
+  : public thrust::reference<
+      Element
+    , thrust::pointer<Element, Tag, tagged_reference<Element, Tag>>
+    , tagged_reference<Element, Tag>
+    >
+{
+private:
+  using base_type = thrust::reference<
+    Element
+  , thrust::pointer<Element, Tag, tagged_reference<Element, Tag>>
+  , tagged_reference<Element, Tag>
+  >;
+
+public:
+  using value_type = typename base_type::value_type;
+  using pointer    = typename base_type::pointer;
+
+  tagged_reference(tagged_reference const&) = default;
+
+  tagged_reference(tagged_reference&&) = default;
+
+  /*! Construct a \p tagged_reference from another \p tagged_reference whose
+   *  pointer type is convertible to \p pointer. After this \p tagged_reference
+   *  is constructed, it shall refer to the same object as \p other.
+   *
+   *  \tparam OtherElement The element type of the other \p tagged_reference.
+   *  \tparam OtherTag     The tag type of the other \p tagged_reference.
+   *  \param  other        A \p tagged_reference to copy from.
+   */
+  template <typename OtherElement, typename OtherTag>
+  __host__ __device__
+  tagged_reference(tagged_reference<OtherElement, OtherTag> const& other)
+    : base_type(other)
+  {}
+
+  /*! Construct a \p tagged_reference that refers to an object pointed to by
+   *  the given \p pointer. After this \p tagged_reference is constructed, it
+   *  shall refer to the object pointed to by \p ptr.
+   *
+   *  \param ptr A \p pointer to construct from.
+   */
+  __host__ __device__ explicit tagged_reference(pointer const& p)
+    : base_type(p)
+  {}
+
+  /*! Assign the object referred to \p other to the object referred to by
+   *  this \p tagged_reference.
+   *
+   *  \param other The other \p tagged_reference to assign from.
+   *
+   *  \return <tt>*this</tt>.
+   */
+  __host__ __device__
+  tagged_reference& operator=(tagged_reference const& other)
+  {
+    return base_type::operator=(other);
+  }
+
+  /*! Assign the object referred to by this \p tagged_reference with the object
+   *  referred to by another \p tagged_reference whose pointer type is
+   *  convertible to \p pointer.
+   *
+   *  \tparam OtherElement The element type of the other \p tagged_reference.
+   *  \tparam OtherTag     The tag type of the other \p tagged_reference.
+   *  \param  other        The other \p tagged_reference to assign from.
+   *
+   *  \return <tt>*this</tt>.
+   */
+  template <typename OtherElement, typename OtherTag>
+  __host__ __device__
+  tagged_reference&
+  operator=(tagged_reference<OtherElement, OtherTag> const& other)
+  {
+    return base_type::operator=(other);
+  }
+
+  /*! Assign \p rhs to the object referred to by this \p tagged_reference.
+   *
+   *  \param rhs The \p value_type to assign from.
+   *
+   *  \return <tt>*this</tt>.
+   */
+  __host__ __device__
+  tagged_reference& operator=(value_type const& rhs)
+  {
+    return base_type::operator=(rhs);
+  }
+};
+
+template <typename Tag>
+class tagged_reference<void, Tag> {};
+
+template <typename Tag>
+class tagged_reference<void const, Tag> {};
+
+/*! Exchanges the values of two objects referred to by \p tagged_reference.
+ *
+ *  \param x The first \p tagged_reference of interest.
+ *  \param y The second \p tagged_reference of interest.
+ */
+template <typename Element, typename Tag>
+__host__ __device__
+void swap(tagged_reference<Element, Tag>& x, tagged_reference<Element, Tag>& y)
+{
+  x.swap(y);
+}
 
-#include <thrust/detail/reference.inl>
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/reference.inl b/thrust/detail/reference.inl
deleted file mode 100644
index 91f2b9736..000000000
--- a/thrust/detail/reference.inl
+++ /dev/null
@@ -1,382 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/detail/config.h>
-#include <thrust/detail/reference.h>
-#include <thrust/detail/type_traits.h>
-#include <thrust/iterator/iterator_traits.h>
-#include <thrust/system/detail/generic/select_system.h>
-#include <thrust/system/detail/generic/memory.h>
-#include <thrust/system/detail/adl/get_value.h>
-#include <thrust/system/detail/adl/assign_value.h>
-#include <thrust/system/detail/adl/iter_swap.h>
-
-
-namespace thrust
-{
-
-
-template<typename Element, typename Pointer, typename Derived>
-  template<typename OtherElement, typename OtherPointer, typename OtherDerived>
-    __host__ __device__
-    reference<Element,Pointer,Derived>
-      ::reference(const reference<OtherElement,OtherPointer,OtherDerived> &other,
-                  typename thrust::detail::enable_if_convertible<
-                    typename reference<OtherElement,OtherPointer,OtherDerived>::pointer,
-                    pointer
-                  >::type *)
-        : m_ptr(other.m_ptr)
-{}
-
-
-template<typename Element, typename Pointer, typename Derived>
-  __host__ __device__
-  reference<Element,Pointer,Derived>
-    ::reference(const pointer &ptr)
-      : m_ptr(ptr)
-{}
-
-
-template<typename Element, typename Pointer, typename Derived>
-  __host__ __device__
-  typename reference<Element,Pointer,Derived>::pointer
-    reference<Element,Pointer,Derived>
-      ::operator&() const
-{
-  return m_ptr;
-} // end reference::operator&()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  __host__ __device__
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator=(const value_type &v)
-{
-  assign_from(&v);
-  return static_cast<derived_type&>(*this);
-} // end reference::operator=()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  __host__ __device__
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator=(const reference &other)
-{
-  assign_from(&other); 
-  return static_cast<derived_type&>(*this);
-} // end reference::operator=()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  template<typename OtherElement, typename OtherPointer, typename OtherDerived>
-    __host__ __device__
-    typename reference<Element,Pointer,Derived>::derived_type &
-      reference<Element,Pointer,Derived>
-        ::operator=(const reference<OtherElement,OtherPointer,OtherDerived> &other)
-{
-  assign_from(&other);
-  return static_cast<derived_type&>(*this);
-} // end reference::operator=()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  template<typename System>
-    __host__ __device__
-    typename reference<Element,Pointer,Derived>::value_type
-      reference<Element,Pointer,Derived>
-        ::convert_to_value_type(System *system) const
-{
-  using thrust::system::detail::generic::select_system;
-  return strip_const_get_value(select_system(*system));
-} // end convert_to_value_type()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  __host__ __device__
-  reference<Element,Pointer,Derived>
-    ::operator typename reference<Element,Pointer,Derived>::value_type () const
-{
-  typedef typename thrust::iterator_system<pointer>::type System;
-
-  // XXX avoid default-constructing a system
-  // XXX use null a reference for dispatching
-  // XXX this assumes that the eventual invocation of
-  // XXX get_value will not access system state
-  System *system = 0;
-
-  return convert_to_value_type(system);
-} // end reference::operator value_type ()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  template<typename System>
-    __host__ __device__
-    typename reference<Element,Pointer,Derived>::value_type
-      reference<Element,Pointer,Derived>
-        ::strip_const_get_value(const System &system) const
-{
-  System &non_const_system = const_cast<System&>(system);
-
-  using thrust::system::detail::generic::get_value;
-
-  return get_value(thrust::detail::derived_cast(non_const_system), m_ptr);
-} // end reference::strip_const_get_value()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  template<typename System1, typename System2, typename OtherPointer>
-    __host__ __device__
-    void reference<Element,Pointer,Derived>
-      ::assign_from(System1 *system1, System2 *system2, OtherPointer src)
-{
-  using thrust::system::detail::generic::select_system;
-
-  strip_const_assign_value(select_system(*system1, *system2), src);
-} // end assign_from()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  template<typename OtherPointer>
-    __host__ __device__
-    void reference<Element,Pointer,Derived>
-      ::assign_from(OtherPointer src)
-{
-  typedef typename thrust::iterator_system<pointer>::type      System1;
-  typedef typename thrust::iterator_system<OtherPointer>::type System2;
-
-  // XXX avoid default-constructing a system
-  // XXX use null references for dispatching
-  // XXX this assumes that the eventual invocation of
-  // XXX assign_value will not access system state
-  System1 *system1 = 0;
-  System2 *system2 = 0;
-
-  assign_from(system1, system2, src);
-} // end assign_from()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  template<typename System, typename OtherPointer>
-    __host__ __device__
-    void reference<Element,Pointer,Derived>
-      ::strip_const_assign_value(const System &system, OtherPointer src)
-{
-  System &non_const_system = const_cast<System&>(system);
-
-  using thrust::system::detail::generic::assign_value;
-
-  assign_value(thrust::detail::derived_cast(non_const_system), m_ptr, src);
-} // end strip_const_assign_value()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  template<typename System>
-    __host__ __device__
-    void reference<Element,Pointer,Derived>
-      ::swap(System *system, derived_type &other)
-{
-  using thrust::system::detail::generic::select_system;
-  using thrust::system::detail::generic::iter_swap;
-
-  iter_swap(select_system(*system, *system), m_ptr, other.m_ptr);
-} // end reference::swap()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  __host__ __device__
-  void reference<Element,Pointer,Derived>
-    ::swap(derived_type &other)
-{
-  typedef typename thrust::iterator_system<pointer>::type System;
-
-  // XXX avoid default-constructing a system
-  // XXX use null references for dispatching
-  // XXX this assumes that the eventual invocation
-  // XXX of iter_swap will not access system state
-  System *system = 0;
-
-  swap(system, other);
-} // end reference::swap()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator++(void)
-{
-  value_type temp = *this;
-  ++temp;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator++()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::value_type
-    reference<Element,Pointer,Derived>
-      ::operator++(int)
-{
-  value_type temp = *this;
-  value_type result = temp++;
-  *this = temp;
-  return result;
-} // end reference::operator++()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator+=(const value_type &rhs)
-{
-  value_type temp = *this;
-  temp += rhs;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator+=()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator--(void)
-{
-  value_type temp = *this;
-  --temp;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator--()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::value_type
-    reference<Element,Pointer,Derived>
-      ::operator--(int)
-{
-  value_type temp = *this;
-  value_type result = temp--;
-  *this = temp;
-  return result;
-} // end reference::operator--()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator-=(const value_type &rhs)
-{
-  value_type temp = *this;
-  temp -= rhs;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator-=()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator*=(const value_type &rhs)
-{
-  value_type temp = *this;
-  temp *= rhs;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator*=()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator/=(const value_type &rhs)
-{
-  value_type temp = *this;
-  temp /= rhs;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator/=()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator%=(const value_type &rhs)
-{
-  value_type temp = *this;
-  temp %= rhs;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator%=()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator<<=(const value_type &rhs)
-{
-  value_type temp = *this;
-  temp <<= rhs;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator<<=()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator>>=(const value_type &rhs)
-{
-  value_type temp = *this;
-  temp >>= rhs;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator>>=()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator&=(const value_type &rhs)
-{
-  value_type temp = *this;
-  temp &= rhs;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator&=()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator|=(const value_type &rhs)
-{
-  value_type temp = *this;
-  temp |= rhs;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator|=()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator^=(const value_type &rhs)
-{
-  value_type temp = *this;
-  temp ^= rhs;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator^=()
-
-template<typename Element, typename Pointer, typename Derived,
-         typename charT, typename traits>
-std::basic_ostream<charT, traits> &
-operator<<(std::basic_ostream<charT, traits> &os,
-           const reference<Element, Pointer, Derived> &y) {
-  typedef typename reference<Element, Pointer, Derived>::value_type value_type;
-  return os << static_cast<value_type>(y);
-} // end operator<<()
-
-} // end thrust
diff --git a/thrust/detail/reference_forward_declaration.h b/thrust/detail/reference_forward_declaration.h
index a8912ca43..6f2b99949 100644
--- a/thrust/detail/reference_forward_declaration.h
+++ b/thrust/detail/reference_forward_declaration.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2020 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -19,10 +19,10 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/use_default.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
-template<typename Element, typename Pointer, typename Derived = use_default> class reference;
+template <typename Element, typename Pointer, typename Derived = use_default>
+class reference;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/remove.inl b/thrust/detail/remove.inl
index f5951fa91..7ccc0cc46 100644
--- a/thrust/detail/remove.inl
+++ b/thrust/detail/remove.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file remove.inl
- *  \brief Inline file for remove.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/remove.h>
@@ -26,8 +23,7 @@
 #include <thrust/system/detail/generic/remove.h>
 #include <thrust/system/detail/adl/remove.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -246,5 +242,5 @@ template<typename InputIterator1,
 } // end remove_copy_if()
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/replace.inl b/thrust/detail/replace.inl
index de5bff4d5..629287bee 100644
--- a/thrust/detail/replace.inl
+++ b/thrust/detail/replace.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file replace.inl
- *  \brief Inline file for replace.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/replace.h>
@@ -26,8 +23,7 @@
 #include <thrust/system/detail/generic/replace.h>
 #include <thrust/system/detail/adl/replace.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -218,5 +214,5 @@ template<typename ForwardIterator, typename T>
 } // end replace()
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/reverse.inl b/thrust/detail/reverse.inl
index e8a018cd6..dc316d18f 100644
--- a/thrust/detail/reverse.inl
+++ b/thrust/detail/reverse.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file reverse.inl
- *  \brief Inline file for reverse.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/reverse.h>
@@ -26,8 +23,7 @@
 #include <thrust/system/detail/generic/reverse.h>
 #include <thrust/system/detail/adl/reverse.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -87,5 +83,5 @@ template<typename BidirectionalIterator,
 } // end reverse_copy()
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/scan.inl b/thrust/detail/scan.inl
index 5329d1118..b781b0e28 100644
--- a/thrust/detail/scan.inl
+++ b/thrust/detail/scan.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file scan.inl
- *  \brief Inline file for scan.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/scan.h>
@@ -28,8 +25,7 @@
 #include <thrust/system/detail/adl/scan.h>
 #include <thrust/system/detail/adl/scan_by_key.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -44,7 +40,7 @@ __host__ __device__
 {
   using thrust::system::detail::generic::inclusive_scan;
   return inclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result);
-} // end inclusive_scan() 
+} // end inclusive_scan()
 
 
 __thrust_exec_check_disable__
@@ -522,5 +518,5 @@ template<typename InputIterator1,
 }
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/scatter.inl b/thrust/detail/scatter.inl
index 50ca8f3aa..30dd611d1 100644
--- a/thrust/detail/scatter.inl
+++ b/thrust/detail/scatter.inl
@@ -14,10 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file scatter.inl
- *  \brief Inline file for scatter.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/scatter.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,8 +24,7 @@
 #include <thrust/system/detail/generic/scatter.h>
 #include <thrust/system/detail/adl/scatter.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -96,9 +94,9 @@ template<typename InputIterator1,
 {
   using thrust::system::detail::generic::select_system;
 
-  typedef typename thrust::iterator_system<InputIterator1>::type       System1; 
-  typedef typename thrust::iterator_system<InputIterator2>::type       System2; 
-  typedef typename thrust::iterator_system<RandomAccessIterator>::type System3; 
+  typedef typename thrust::iterator_system<InputIterator1>::type       System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type       System2;
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System3;
 
   System1 system1;
   System2 system2;
@@ -120,10 +118,10 @@ template<typename InputIterator1,
 {
   using thrust::system::detail::generic::select_system;
 
-  typedef typename thrust::iterator_system<InputIterator1>::type       System1; 
-  typedef typename thrust::iterator_system<InputIterator2>::type       System2; 
-  typedef typename thrust::iterator_system<InputIterator3>::type       System3; 
-  typedef typename thrust::iterator_system<RandomAccessIterator>::type System4; 
+  typedef typename thrust::iterator_system<InputIterator1>::type       System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type       System2;
+  typedef typename thrust::iterator_system<InputIterator3>::type       System3;
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System4;
 
   System1 system1;
   System2 system2;
@@ -148,10 +146,10 @@ template<typename InputIterator1,
 {
   using thrust::system::detail::generic::select_system;
 
-  typedef typename thrust::iterator_system<InputIterator1>::type       System1; 
-  typedef typename thrust::iterator_system<InputIterator2>::type       System2; 
-  typedef typename thrust::iterator_system<InputIterator3>::type       System3; 
-  typedef typename thrust::iterator_system<RandomAccessIterator>::type System4; 
+  typedef typename thrust::iterator_system<InputIterator1>::type       System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type       System2;
+  typedef typename thrust::iterator_system<InputIterator3>::type       System3;
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System4;
 
   System1 system1;
   System2 system2;
@@ -161,6 +159,5 @@ template<typename InputIterator1,
   return thrust::scatter_if(select_system(system1,system2,system3,system4), first, last, map, stencil, output, pred);
 } // end scatter_if()
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/select_system.h b/thrust/detail/select_system.h
index dd07a28d1..968446162 100644
--- a/thrust/detail/select_system.h
+++ b/thrust/detail/select_system.h
@@ -25,7 +25,7 @@
 #include <thrust/type_traits/remove_cvref.h>
 #include <thrust/system/detail/generic/select_system.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -78,7 +78,7 @@ THRUST_INLINE_CONSTANT select_system_detail::select_system_fn select_system{};
 
 } // detail
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #endif // THRUST_CPP_DIALECT >= 2011
 
diff --git a/thrust/detail/seq.h b/thrust/detail/seq.h
index ecc1d8dd5..ba18c2dbf 100644
--- a/thrust/detail/seq.h
+++ b/thrust/detail/seq.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/allocator_aware_execution_policy.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -31,7 +30,7 @@ struct seq_t : thrust::system::detail::sequential::execution_policy<seq_t>,
     thrust::system::detail::sequential::execution_policy>
 {
   __host__ __device__
-  seq_t() : thrust::system::detail::sequential::execution_policy<seq_t>() {}
+  constexpr seq_t() : thrust::system::detail::sequential::execution_policy<seq_t>() {}
 
   // allow any execution_policy to convert to seq_t
   template<typename DerivedPolicy>
@@ -45,13 +44,9 @@ struct seq_t : thrust::system::detail::sequential::execution_policy<seq_t>,
 } // end detail
 
 
-#ifdef __CUDA_ARCH__
-static const __device__ detail::seq_t seq;
-#else
-static const detail::seq_t seq;
-#endif
+THRUST_INLINE_CONSTANT detail::seq_t seq;
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 
diff --git a/thrust/detail/sequence.inl b/thrust/detail/sequence.inl
index fff7cbb63..ffc9b968b 100644
--- a/thrust/detail/sequence.inl
+++ b/thrust/detail/sequence.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file sequence.inl
- *  \brief Inline file for sequence.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/sequence.h>
@@ -26,8 +23,7 @@
 #include <thrust/system/detail/generic/sequence.h>
 #include <thrust/system/detail/adl/sequence.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -114,5 +110,5 @@ template<typename ForwardIterator, typename T>
 } // end sequence()
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/set_operations.inl b/thrust/detail/set_operations.inl
index 42cf5ed35..7915f7b3e 100644
--- a/thrust/detail/set_operations.inl
+++ b/thrust/detail/set_operations.inl
@@ -14,9 +14,7 @@
  *  limitations under the License.
  */
 
-/*! \file set_operations.inl
- *  \brief Inline file for set_operations.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -24,8 +22,7 @@
 #include <thrust/system/detail/generic/set_operations.h>
 #include <thrust/system/detail/adl/set_operations.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -864,5 +861,5 @@ template<typename InputIterator1,
 } // end set_union_by_key()
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/shuffle.inl b/thrust/detail/shuffle.inl
new file mode 100644
index 000000000..48f5ba639
--- /dev/null
+++ b/thrust/detail/shuffle.inl
@@ -0,0 +1,83 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/shuffle.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/shuffle.h>
+
+THRUST_NAMESPACE_BEGIN
+
+__thrust_exec_check_disable__
+template <typename DerivedPolicy, typename RandomIterator, typename URBG>
+__host__ __device__ void shuffle(
+    const thrust::detail::execution_policy_base<DerivedPolicy>& exec,
+    RandomIterator first, RandomIterator last, URBG&& g) {
+  using thrust::system::detail::generic::shuffle;
+  return shuffle(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      first, last, g);
+}
+
+template <typename RandomIterator, typename URBG>
+__host__ __device__ void shuffle(RandomIterator first, RandomIterator last,
+                                 URBG&& g) {
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<RandomIterator>::type System;
+  System system;
+
+  return thrust::shuffle(select_system(system), first, last, g);
+}
+
+__thrust_exec_check_disable__
+template <typename DerivedPolicy, typename RandomIterator,
+          typename OutputIterator, typename URBG>
+__host__ __device__ void shuffle_copy(
+    const thrust::detail::execution_policy_base<DerivedPolicy>& exec,
+    RandomIterator first, RandomIterator last, OutputIterator result,
+    URBG&& g) {
+  using thrust::system::detail::generic::shuffle_copy;
+  return shuffle_copy(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      first, last, result, g);
+}
+
+template <typename RandomIterator, typename OutputIterator, typename URBG>
+__host__ __device__ void shuffle_copy(RandomIterator first, RandomIterator last,
+                                      OutputIterator result, URBG&& g) {
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<RandomIterator>::type System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::shuffle_copy(select_system(system1, system2), first, last,
+                              result, g);
+}
+
+THRUST_NAMESPACE_END
+
+#endif
diff --git a/thrust/detail/sort.inl b/thrust/detail/sort.inl
index d4a7901e6..53f8bad93 100644
--- a/thrust/detail/sort.inl
+++ b/thrust/detail/sort.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file sort.inl
- *  \brief Inline file for sort.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/sort.h>
@@ -26,8 +23,7 @@
 #include <thrust/system/detail/generic/sort.h>
 #include <thrust/system/detail/adl/sort.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -244,7 +240,7 @@ template<typename RandomAccessIterator>
   System system;
 
   return thrust::stable_sort(select_system(system), first, last);
-} // end stable_sort() 
+} // end stable_sort()
 
 
 template<typename RandomAccessIterator,
@@ -349,7 +345,7 @@ template<typename ForwardIterator>
                  ForwardIterator last)
 {
   using thrust::system::detail::generic::select_system;
-  
+
   typedef typename thrust::iterator_system<ForwardIterator>::type System;
 
   System system;
@@ -365,7 +361,7 @@ template<typename ForwardIterator,
                  Compare comp)
 {
   using thrust::system::detail::generic::select_system;
-  
+
   typedef typename thrust::iterator_system<ForwardIterator>::type System;
 
   System system;
@@ -379,7 +375,7 @@ template<typename ForwardIterator>
                                   ForwardIterator last)
 {
   using thrust::system::detail::generic::select_system;
-  
+
   typedef typename thrust::iterator_system<ForwardIterator>::type System;
 
   System system;
@@ -395,7 +391,7 @@ template<typename ForwardIterator,
                                   Compare comp)
 {
   using thrust::system::detail::generic::select_system;
-  
+
   typedef typename thrust::iterator_system<ForwardIterator>::type System;
 
   System system;
@@ -404,5 +400,5 @@ template<typename ForwardIterator,
 } // end is_sorted_until()
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/static_assert.h b/thrust/detail/static_assert.h
index 66d7eb70f..0e6132790 100644
--- a/thrust/detail/static_assert.h
+++ b/thrust/detail/static_assert.h
@@ -29,7 +29,7 @@
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/preprocessor.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -65,16 +65,16 @@ template <int x> struct static_assert_test {};
   // Clang and GCC 4.8+ will complain about this typedef being unused unless we
   // annotate it as such.
 #  define THRUST_STATIC_ASSERT(B)                                             \
-    typedef ::thrust::detail::static_assert_test<                             \
-      sizeof(::thrust::detail::STATIC_ASSERTION_FAILURE<(bool)(B)>)           \
+    typedef THRUST_NS_QUALIFIER::detail::static_assert_test<                  \
+      sizeof(THRUST_NS_QUALIFIER::detail::STATIC_ASSERTION_FAILURE<(bool)(B)>)\
     >                                                                         \
       THRUST_PP_CAT2(thrust_static_assert_typedef_, __LINE__)                 \
       __attribute__((unused))                                                 \
     /**/      
 #else
 #  define THRUST_STATIC_ASSERT(B)                                             \
-    typedef ::thrust::detail::static_assert_test<                             \
-      sizeof(::thrust::detail::STATIC_ASSERTION_FAILURE<(bool)(B)>)           \
+    typedef THRUST_NS_QUALIFIER::detail::static_assert_test<                  \
+      sizeof(THRUST_NS_QUALIFIER::detail::STATIC_ASSERTION_FAILURE<(bool)(B)>)\
     >                                                                         \
       THRUST_PP_CAT2(thrust_static_assert_typedef_, __LINE__)                 \
     /**/      
@@ -86,6 +86,6 @@ template <int x> struct static_assert_test {};
 
 } // namespace detail
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 
diff --git a/thrust/detail/static_map.h b/thrust/detail/static_map.h
index 872a73aef..9f0d79e83 100644
--- a/thrust/detail/static_map.h
+++ b/thrust/detail/static_map.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace static_map_detail
@@ -166,5 +165,5 @@ unsigned int lookup(unsigned int key)
 
 
 } // end namespace detail
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/swap.h b/thrust/detail/swap.h
index 96783c762..305750f8a 100644
--- a/thrust/detail/swap.h
+++ b/thrust/detail/swap.h
@@ -19,8 +19,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename Assignable1, typename Assignable2>
@@ -32,5 +31,5 @@ inline void swap(Assignable1 &a, Assignable2 &b)
   b = temp;
 } // end swap()
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/swap.inl b/thrust/detail/swap.inl
index 9364ef8ad..196c34f41 100644
--- a/thrust/detail/swap.inl
+++ b/thrust/detail/swap.inl
@@ -14,6 +14,7 @@
  *  limitations under the License.
  */
 
+#pragma once
 
 #include <thrust/swap.h>
 #include <thrust/detail/swap.h>
diff --git a/thrust/detail/swap_ranges.inl b/thrust/detail/swap_ranges.inl
index 8ed97cc74..1f35c1ff3 100644
--- a/thrust/detail/swap_ranges.inl
+++ b/thrust/detail/swap_ranges.inl
@@ -14,10 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file swap_ranges.inl
- *  \brief Inline file for swap_ranges.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/swap.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,8 +24,7 @@
 #include <thrust/system/detail/generic/swap_ranges.h>
 #include <thrust/system/detail/adl/swap_ranges.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -62,5 +60,5 @@ template<typename ForwardIterator1,
 } // end swap_ranges()
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/tabulate.inl b/thrust/detail/tabulate.inl
index f6385234e..308be061f 100644
--- a/thrust/detail/tabulate.inl
+++ b/thrust/detail/tabulate.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/tabulate.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -21,8 +23,7 @@
 #include <thrust/system/detail/generic/tabulate.h>
 #include <thrust/system/detail/adl/tabulate.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -53,5 +54,5 @@ template<typename ForwardIterator, typename UnaryOperation>
 } // end tabulate()
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/temporary_array.h b/thrust/detail/temporary_array.h
index 1511d2b78..cf4bc7d2d 100644
--- a/thrust/detail/temporary_array.h
+++ b/thrust/detail/temporary_array.h
@@ -20,8 +20,9 @@
 
 #pragma once
 
-namespace thrust
-{
+#include <thrust/detail/config.h>
+
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -31,7 +32,7 @@ template<typename T, typename System>
   class temporary_array;
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/config.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -39,10 +40,9 @@ template<typename T, typename System>
 #include <thrust/detail/contiguous_storage.h>
 #include <thrust/detail/allocator/temporary_allocator.h>
 #include <thrust/detail/allocator/no_throw_allocator.h>
-#include <memory>
+#include <thrust/detail/memory_wrapper.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -175,7 +175,7 @@ template<typename Iterator, typename FromSystem, typename ToSystem>
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/temporary_array.inl>
 
diff --git a/thrust/detail/temporary_array.inl b/thrust/detail/temporary_array.inl
index e730966c0..90b7279ac 100644
--- a/thrust/detail/temporary_array.inl
+++ b/thrust/detail/temporary_array.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,14 +14,16 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/temporary_array.h>
 #include <thrust/distance.h>
 #include <thrust/system/detail/generic/select_system.h>
 #include <thrust/detail/type_traits.h>
 
-
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -162,5 +164,5 @@ __host__ __device__
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/temporary_buffer.h b/thrust/detail/temporary_buffer.h
index 6eb68de49..be95e7180 100644
--- a/thrust/detail/temporary_buffer.h
+++ b/thrust/detail/temporary_buffer.h
@@ -25,12 +25,9 @@
 #include <thrust/system/detail/generic/temporary_buffer.h>
 #include <thrust/system/detail/adl/temporary_buffer.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
-namespace get_temporary_buffer_detail
-{
 
 
 template<typename T, typename DerivedPolicy, typename Pair>
@@ -46,7 +43,6 @@ __host__ __device__
 } // end down_cast_pair()
 
 
-} // end get_temporary_buffer_detail
 } // end detail
 
 
@@ -59,21 +55,21 @@ __host__ __device__
   using thrust::detail::get_temporary_buffer; // execute_with_allocator
   using thrust::system::detail::generic::get_temporary_buffer;
 
-  return thrust::detail::get_temporary_buffer_detail::down_cast_pair<T,DerivedPolicy>(get_temporary_buffer<T>(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), n));
+  return thrust::detail::down_cast_pair<T,DerivedPolicy>(get_temporary_buffer<T>(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), n));
 } // end get_temporary_buffer()
 
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy, typename Pointer>
 __host__ __device__
-  void return_temporary_buffer(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, Pointer p)
+  void return_temporary_buffer(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, Pointer p, std::ptrdiff_t n)
 {
   using thrust::detail::return_temporary_buffer; // execute_with_allocator
   using thrust::system::detail::generic::return_temporary_buffer;
 
-  return return_temporary_buffer(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), p);
+  return return_temporary_buffer(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), p, n);
 } // end return_temporary_buffer()
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/transform.inl b/thrust/detail/transform.inl
index c27e4de27..62bafd35e 100644
--- a/thrust/detail/transform.inl
+++ b/thrust/detail/transform.inl
@@ -14,10 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file transform.inl
- *  \brief Inline file for transform.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/transform.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,8 +24,7 @@
 #include <thrust/system/detail/generic/transform.h>
 #include <thrust/system/detail/adl/transform.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -245,5 +243,5 @@ template<typename InputIterator1,
 } // end transform_if()
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/transform_reduce.inl b/thrust/detail/transform_reduce.inl
index 571b0e79b..702dd9f73 100644
--- a/thrust/detail/transform_reduce.inl
+++ b/thrust/detail/transform_reduce.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file transform_reduce.inl
- *  \brief Inline file for transform_reduce.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,14 +22,13 @@
 #include <thrust/system/detail/generic/transform_reduce.h>
 #include <thrust/system/detail/adl/transform_reduce.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy,
-         typename InputIterator, 
-         typename UnaryFunction, 
+         typename InputIterator,
+         typename UnaryFunction,
          typename OutputType,
          typename BinaryFunction>
 __host__ __device__
@@ -48,8 +44,8 @@ __host__ __device__
 } // end transform_reduce()
 
 
-template<typename InputIterator, 
-         typename UnaryFunction, 
+template<typename InputIterator,
+         typename UnaryFunction,
          typename OutputType,
          typename BinaryFunction>
   OutputType transform_reduce(InputIterator first,
@@ -68,5 +64,5 @@ template<typename InputIterator,
 } // end transform_reduce()
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/transform_scan.inl b/thrust/detail/transform_scan.inl
index d6a488b0a..957001cef 100644
--- a/thrust/detail/transform_scan.inl
+++ b/thrust/detail/transform_scan.inl
@@ -14,10 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file transform_scan.inl
- *  \brief Inline file for transform_scan.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/scan.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,8 +24,7 @@
 #include <thrust/system/detail/generic/transform_scan.h>
 #include <thrust/system/detail/adl/transform_scan.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -115,5 +113,5 @@ template<typename InputIterator,
 } // end transform_exclusive_scan()
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/trivial_sequence.h b/thrust/detail/trivial_sequence.h
index b6c3ed9eb..2cf98e787 100644
--- a/thrust/detail/trivial_sequence.h
+++ b/thrust/detail/trivial_sequence.h
@@ -23,14 +23,15 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/execution_policy.h>
 #include <thrust/detail/temporary_array.h>
 #include <thrust/type_traits/is_contiguous_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -91,5 +92,5 @@ struct trivial_sequence
 
 } // end namespace detail
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/tuple.inl b/thrust/detail/tuple.inl
index 7fe1567f2..f4930bf4b 100644
--- a/thrust/detail/tuple.inl
+++ b/thrust/detail/tuple.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2018 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,11 +14,14 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/swap.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 // define null_type
 struct null_type {};
@@ -50,38 +53,79 @@ template <
   class T9 = null_type>
 class tuple;
 
-// forward declaration of tuple_element
-template<int i, typename T> struct tuple_element;
 
-// specializations for tuple_element
-template<class T>
-  struct tuple_element<0,T>
-{
-  typedef typename T::head_type type;
-}; // end tuple_element<0,T>
+template <size_t N, class T> struct tuple_element;
 
-template<int N, class T>
-  struct tuple_element<N, const T>
+template<size_t N, class T>
+  struct tuple_element_impl
 {
   private:
     typedef typename T::tail_type Next;
-    typedef typename tuple_element<N-1, Next>::type unqualified_type;
 
   public:
-    typedef typename thrust::detail::add_const<unqualified_type>::type type;
-}; // end tuple_element<N, const T>
+    /*! The result of this metafunction is returned in \c type.
+     */
+    typedef typename tuple_element_impl<N-1, Next>::type type;
+}; // end tuple_element
 
 template<class T>
-  struct tuple_element<0,const T>
+  struct tuple_element_impl<0,T>
 {
-  typedef typename thrust::detail::add_const<typename T::head_type>::type type;
-}; // end tuple_element<0,const T>
+  typedef typename T::head_type type;
+};
 
+template <size_t N, class T>
+  struct tuple_element<N, T const>
+{
+    using type = typename std::add_const<typename tuple_element<N, T>::type>::type;
+};
+
+template <size_t N, class T>
+struct tuple_element<N, T volatile>
+{
+    using type = typename std::add_volatile<typename tuple_element<N, T>::type>::type;
+};
 
+template <size_t N, class T>
+  struct tuple_element<N, T const volatile>
+{
+    using type = typename std::add_cv<typename tuple_element<N, T>::type>::type;
+};
+
+template <size_t N, class T>
+struct tuple_element{
+    using type = typename tuple_element_impl<N,T>::type;
+};
 
 // forward declaration of tuple_size
 template<class T> struct tuple_size;
 
+template<class T>
+  struct tuple_size<T const> : public tuple_size<T> {};
+
+template<class T>
+  struct tuple_size<T volatile> : public tuple_size<T> {};
+
+template<class T>
+  struct tuple_size<T const volatile> : public tuple_size<T> {};
+
+/*! This metafunction returns the number of elements
+ *  of a \p tuple type of interest.
+ *
+ *  \tparam T A \c tuple type of interest.
+ *
+ *  \see pair
+ *  \see tuple
+ */
+template<class T>
+  struct tuple_size
+{
+  /*! The result of this metafunction is returned in \c value.
+   */
+  static const int value = 1 + tuple_size<typename T::tail_type>::value;
+}; // end tuple_size
+
+
 // specializations for tuple_size
 template<>
   struct tuple_size< tuple<> >
@@ -169,7 +213,7 @@ struct get_class
     // XXX we may not need to deal with this for any compiler we care about -jph
     //return get_class<N-1>::BOOST_NESTED_TEMPLATE get<RET>(t.tail);
     return get_class<N-1>::template get<RET>(t.tail);
-    
+
     // gcc 4.3 couldn't compile this:
     //return get_class<N-1>::get<RET>(t.tail);
   }
@@ -309,6 +353,10 @@ template <class HT, class TT>
   inline __host__ __device__
   cons( const cons<HT2, TT2>& u ) : head(u.head), tail(u.tail) {}
 
+#if THRUST_CPP_DIALECT >= 2011
+  cons(const cons &) = default;
+#endif
+
   __thrust_exec_check_disable__
   template <class HT2, class TT2>
   inline __host__ __device__
@@ -412,6 +460,10 @@ template <class HT>
   inline __host__ __device__
   cons( const cons<HT2, null_type>& u ) : head(u.head) {}
 
+#if THRUST_CPP_DIALECT >= 2011
+  cons(const cons &) = default;
+#endif
+
   __thrust_exec_check_disable__
   template <class HT2>
   inline __host__ __device__
@@ -590,7 +642,7 @@ inline typename access_traits<
 get(detail::cons<HT, TT>& c)
 {
   //return detail::get_class<N>::BOOST_NESTED_TEMPLATE
-  
+
   // gcc 4.3 couldn't compile this:
   //return detail::get_class<N>::
 
@@ -948,5 +1000,5 @@ inline bool operator>=(const detail::cons<T1, T2>& lhs, const detail::cons<S1, S
   return detail::gte(lhs, rhs);
 } // end operator>=()
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/tuple_algorithms.h b/thrust/detail/tuple_algorithms.h
index 2c506b077..2e49f4281 100644
--- a/thrust/detail/tuple_algorithms.h
+++ b/thrust/detail/tuple_algorithms.h
@@ -26,7 +26,7 @@
 
 #include <tuple>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 template <typename Tuple, std::size_t... Is>
 auto tuple_subset(Tuple&& t, index_sequence<Is...>)
@@ -39,7 +39,7 @@ template <typename Tuple, typename F, std::size_t... Is>
 void tuple_for_each_impl(Tuple&& t, F&& f, index_sequence<Is...>)
 {
   auto l = { (f(std::get<Is>(t)), 0)... };
-  THRUST_UNUSED(l);
+  THRUST_UNUSED_VAR(l);
 }
 
 template <typename Tuple, typename F, std::size_t... Is>
@@ -104,7 +104,7 @@ THRUST_DECLTYPE_RETURNS(
   )
 );
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #endif // THRUST_CPP_DIALECT >= 2011
 
diff --git a/thrust/detail/tuple_meta_transform.h b/thrust/detail/tuple_meta_transform.h
index 4aca1a91b..285cae8b4 100644
--- a/thrust/detail/tuple_meta_transform.h
+++ b/thrust/detail/tuple_meta_transform.h
@@ -16,162 +16,43 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/tuple.h>
+#include <thrust/type_traits/integer_sequence.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
 
+// introduce an intermediate type tuple_meta_transform_WAR_NVCC
+// rather than directly specializing tuple_meta_transform with
+// default argument IndexSequence = thrust::make_index_sequence<thrust::tuple_size<Tuple>::value>
+// to workaround nvcc 11.0 compiler bug
 template<typename Tuple,
          template<typename> class UnaryMetaFunction,
-         unsigned int sz = thrust::tuple_size<Tuple>::value>
-  struct tuple_meta_transform;
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,0>
-{
-  typedef null_type type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,1>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,2>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type
-  > type;
-};
+         typename IndexSequence>
+  struct tuple_meta_transform_WAR_NVCC;
 
 template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,3>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,4>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,5>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,6>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<5,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,7>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<5,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<6,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,8>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<5,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<6,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<7,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,9>
+         template<typename> class UnaryMetaFunction,
+         size_t... Is>
+  struct tuple_meta_transform_WAR_NVCC<Tuple, UnaryMetaFunction, thrust::index_sequence<Is...>>
 {
   typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<5,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<6,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<7,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<8,Tuple>::type>::type
+    typename UnaryMetaFunction<typename thrust::tuple_element<Is,Tuple>::type>::type...
   > type;
 };
 
 template<typename Tuple,
          template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,10>
+  struct tuple_meta_transform
 {
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<5,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<6,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<7,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<8,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<9,Tuple>::type>::type
-  > type;
+  typedef typename tuple_meta_transform_WAR_NVCC<Tuple, UnaryMetaFunction, thrust::make_index_sequence<thrust::tuple_size<Tuple>::value>>::type type;
 };
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/tuple_transform.h b/thrust/detail/tuple_transform.h
index 166fab3cb..1011d5179 100644
--- a/thrust/detail/tuple_transform.h
+++ b/thrust/detail/tuple_transform.h
@@ -16,11 +16,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/tuple.h>
 #include <thrust/detail/tuple_meta_transform.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -28,332 +29,15 @@ namespace detail
 template<typename Tuple,
          template<typename> class UnaryMetaFunction,
          typename UnaryFunction,
-         unsigned int sz = thrust::tuple_size<Tuple>::value>
+         typename IndexSequence = thrust::make_index_sequence<thrust::tuple_size<Tuple>::value>>
   struct tuple_transform_functor;
 
 
 template<typename Tuple,
          template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,0>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &, UnaryFunction)
-  {
-    return thrust::null_type();
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &, UnaryFunction)
-  {
-    return thrust::null_type();
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,1>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,2>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,3>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,4>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,5>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,6>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,7>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,8>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)),
-                     f(thrust::get<7>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)),
-                     f(thrust::get<7>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,9>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)),
-                     f(thrust::get<7>(t)),
-                     f(thrust::get<8>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)),
-                     f(thrust::get<7>(t)),
-                     f(thrust::get<8>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,10>
+         typename UnaryFunction,
+         size_t... Is>
+  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,thrust::index_sequence<Is...>>
 {
   static __host__
   typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
@@ -361,16 +45,7 @@ template<typename Tuple,
   {
     typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
 
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)),
-                     f(thrust::get<7>(t)),
-                     f(thrust::get<8>(t)),
-                     f(thrust::get<9>(t)));
+    return XfrmTuple(f(thrust::get<Is>(t))...);
   }
 
   static __host__ __device__
@@ -379,16 +54,7 @@ template<typename Tuple,
   {
     typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
 
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)),
-                     f(thrust::get<7>(t)),
-                     f(thrust::get<8>(t)),
-                     f(thrust::get<9>(t)));
+    return XfrmTuple(f(thrust::get<Is>(t))...);
   }
 };
 
@@ -414,5 +80,5 @@ tuple_host_device_transform(const Tuple &t, UnaryFunction f)
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/type_deduction.h b/thrust/detail/type_deduction.h
index 735b31d68..6f240711d 100644
--- a/thrust/detail/type_deduction.h
+++ b/thrust/detail/type_deduction.h
@@ -51,22 +51,38 @@
 /// \brief Expands to a function definition, including a trailing returning
 ///        type, that returns the expression \c __VA_ARGS__.
 ///
-#define THRUST_DECLTYPE_RETURNS(...)                                          \
-  noexcept(noexcept(__VA_ARGS__))                                             \
-  -> decltype(__VA_ARGS__)                                                    \
+// Trailing return types seem to confuse Doxygen, and cause it to interpret
+// parts of the function's body as new function signatures.
+#if defined(THRUST_DOXYGEN)
+  #define THRUST_DECLTYPE_RETURNS(...)                                        \
   { return (__VA_ARGS__); }                                                   \
   /**/
+#else
+  #define THRUST_DECLTYPE_RETURNS(...)                                        \
+    noexcept(noexcept(__VA_ARGS__))                                           \
+    -> decltype(__VA_ARGS__)                                                  \
+    { return (__VA_ARGS__); }                                                 \
+    /**/
+#endif
 
 /// \def THRUST_DECLTYPE_RETURNS_WITH_SFINAE_CONDITION(condition, ...)
 /// \brief Expands to a function definition, including a trailing returning
-///        type, that returns the expression \c __VA_ARGS__. It shall only 
+///        type, that returns the expression \c __VA_ARGS__. It shall only
 ///        participate in overload resolution if \c condition is \c true.
 ///
-#define THRUST_DECLTYPE_RETURNS_WITH_SFINAE_CONDITION(condition, ...)         \
-  noexcept(noexcept(__VA_ARGS__))                                             \
-  -> typename std::enable_if<condition, decltype(__VA_ARGS__)>::type          \
+// Trailing return types seem to confuse Doxygen, and cause it to interpret
+// parts of the function's body as new function signatures.
+#if defined(THRUST_DOXYGEN)
+  #define THRUST_DECLTYPE_RETURNS(...)                                        \
   { return (__VA_ARGS__); }                                                   \
   /**/
+#else
+  #define THRUST_DECLTYPE_RETURNS_WITH_SFINAE_CONDITION(condition, ...)       \
+    noexcept(noexcept(__VA_ARGS__))                                           \
+    -> typename std::enable_if<condition, decltype(__VA_ARGS__)>::type        \
+    { return (__VA_ARGS__); }                                                 \
+    /**/
+#endif
 
 ///////////////////////////////////////////////////////////////////////////////
 
diff --git a/thrust/detail/type_traits.h b/thrust/detail/type_traits.h
index ad02ba6f9..f25eaeaf0 100644
--- a/thrust/detail/type_traits.h
+++ b/thrust/detail/type_traits.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2018 NVIDIA Corporation
+ *  Copyright 2008-2022 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -24,12 +24,11 @@
 
 #include <thrust/detail/config.h>
 
-#if THRUST_CPP_DIALECT >= 2011
-#  include <type_traits>
-#endif
+#include <cuda/std/type_traits>
 
-namespace thrust
-{
+#include <type_traits>
+
+THRUST_NAMESPACE_BEGIN
 
 // forward declaration of device_reference
 template<typename T> class device_reference;
@@ -48,22 +47,17 @@ namespace detail
      // We don't want to switch to std::integral_constant, because we want access
      // to the C++14 operator(), but we'd like standard traits to interoperate
      // with our version when tag dispatching.
-     #if THRUST_CPP_DIALECT >= 2011
-     constexpr integral_constant() = default;
+     integral_constant() = default;
 
-     constexpr integral_constant(integral_constant const&) = default;
+     integral_constant(integral_constant const&) = default;
 
-     #if THRUST_CPP_DIALECT >= 2014
-     constexpr // In C++11, constexpr makes member functions const.
-     #endif
      integral_constant& operator=(integral_constant const&) = default;
 
      constexpr __host__ __device__
-     integral_constant(std::integral_constant<T, v>) {}
-     #endif
+     integral_constant(std::integral_constant<T, v>) noexcept {}
 
-     THRUST_CONSTEXPR __host__ __device__ operator value_type() const THRUST_NOEXCEPT { return value; }
-     THRUST_CONSTEXPR __host__ __device__ value_type operator()() const THRUST_NOEXCEPT { return value; }
+     constexpr __host__ __device__ operator value_type() const noexcept { return value; }
+     constexpr __host__ __device__ value_type operator()() const noexcept { return value; }
    };
  
  /// typedef for true_type
@@ -134,7 +128,7 @@ template<typename T> struct is_pod
        || __is_pod(T)
 #elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC
 // only use the intrinsic for >= 4.3
-#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 3)
+#if (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)
        || __is_pod(T)
 #endif // GCC VERSION
 #endif // THRUST_HOST_COMPILER
@@ -142,36 +136,14 @@ template<typename T> struct is_pod
  {};
 
 
-template<typename T> struct has_trivial_constructor
-  : public integral_constant<
-      bool,
-      is_pod<T>::value
-#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC || \
-    THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG
-      || __has_trivial_constructor(T)
-#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC
-// only use the intrinsic for >= 4.3
-#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 3)
-      || __has_trivial_constructor(T)
-#endif // GCC VERSION
-#endif // THRUST_HOST_COMPILER
-      >
+template <typename T> 
+struct has_trivial_constructor
+  : public integral_constant<bool, is_pod<T>::value || ::cuda::std::is_trivially_constructible<T>::value> 
 {};
 
-template<typename T> struct has_trivial_copy_constructor
-  : public integral_constant<
-      bool,
-      is_pod<T>::value
-#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC || \
-    THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG
-      || __has_trivial_copy(T)
-#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC
-// only use the intrinsic for >= 4.3
-#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 3)
-      || __has_trivial_copy(T)
-#endif // GCC VERSION
-#endif // THRUST_HOST_COMPILER
-    >
+template<typename T> 
+struct has_trivial_copy_constructor
+  : public integral_constant<bool, is_pod<T>::value || ::cuda::std::is_trivially_copyable<T>::value>
 {};
 
 template<typename T> struct has_trivial_destructor : public is_pod<T> {};
@@ -394,22 +366,44 @@ template<typename T1, typename T2>
 
 
 // mpl stuff
+template<typename... Conditions>
+  struct or_;
 
-template <typename Condition1,               typename Condition2,              typename Condition3 = false_type,
-          typename Condition4  = false_type, typename Condition5 = false_type, typename Condition6 = false_type,
-          typename Condition7  = false_type, typename Condition8 = false_type, typename Condition9 = false_type,
-          typename Condition10 = false_type>
-  struct or_
+template <>
+  struct or_<>
     : public integral_constant<
         bool,
-        Condition1::value || Condition2::value || Condition3::value || Condition4::value || Condition5::value || Condition6::value || Condition7::value || Condition8::value || Condition9::value || Condition10::value
+        false_type::value  // identity for or_
       >
 {
 }; // end or_
 
-template <typename Condition1, typename Condition2, typename Condition3 = true_type>
-  struct and_
-    : public integral_constant<bool, Condition1::value && Condition2::value && Condition3::value>
+template <typename Condition, typename... Conditions>
+  struct or_<Condition, Conditions...>
+    : public integral_constant<
+        bool,
+        Condition::value || or_<Conditions...>::value
+      >
+{
+}; // end or_
+
+template <typename... Conditions>
+  struct and_;
+
+template<>
+  struct and_<>
+    : public integral_constant<
+        bool,
+        true_type::value // identity for and_
+      >
+{
+}; // end and_
+
+template <typename Condition, typename... Conditions>
+  struct and_<Condition, Conditions...>
+    : public integral_constant<
+        bool,
+        Condition::value && and_<Conditions...>::value>
 {
 }; // end and_
 
@@ -550,15 +544,7 @@ template<typename T>
 
 struct largest_available_float
 {
-#if defined(__CUDA_ARCH__)
-#  if (__CUDA_ARCH__ < 130)
-  typedef float type;
-#  else
   typedef double type;
-#  endif
-#else
-  typedef double type;
-#endif
 };
 
 // T1 wins if they are both the same size
@@ -632,7 +618,7 @@ template<typename T1, typename T2>
 
   template<typename T> static typename add_reference<T>::type declval();
   
-  template<unsigned int> struct helper { typedef void * type; };
+  template<size_t> struct helper { typedef void * type; };
 
   template<typename U1, typename U2> static yes_type test(typename helper<sizeof(declval<U1>() = declval<U2>())>::type);
 
@@ -705,13 +691,27 @@ template<typename T>
   {
   };
 
+template <typename Invokable, typename... Args>
+using invoke_result_t =
+#if THRUST_CPP_DIALECT < 2017
+  typename ::cuda::std::result_of<Invokable(Args...)>::type;
+#else // 2017+
+  ::cuda::std::invoke_result_t<Invokable, Args...>;
+#endif
+
+template <class F, class... Us> 
+struct invoke_result
+{
+  using type = invoke_result_t<F, Us...>;
+};
+
 } // end detail
 
 using detail::integral_constant;
 using detail::true_type;
 using detail::false_type;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/type_traits/has_trivial_assign.h>
 
diff --git a/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h b/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h
deleted file mode 100644
index f221c915f..000000000
--- a/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/detail/type_traits.h>
-#include <thrust/detail/type_traits/function_traits.h>
-#include <thrust/detail/type_traits/iterator/is_output_iterator.h>
-
-namespace thrust
-{
-
-namespace detail
-{
-
-// this trait reports what type should be used as a temporary in certain algorithms
-// which aggregate intermediate results from a function before writing to an output iterator
-
-// the pseudocode for deducing the type of the temporary used below:
-// 
-// if Function is an AdaptableFunction
-//   result = Function::result_type
-// else if OutputIterator2 is a "pure" output iterator
-//   result = InputIterator2::value_type
-// else
-//   result = OutputIterator2::value_type
-//
-// XXX upon c++0x, TemporaryType needs to be:
-// result_of_adaptable_function<BinaryFunction>::type
-template<typename InputIterator, typename OutputIterator, typename Function>
-  struct intermediate_type_from_function_and_iterators
-    : eval_if<
-        has_result_type<Function>::value,
-        result_type<Function>,
-        eval_if<
-          is_output_iterator<OutputIterator>::value,
-          thrust::iterator_value<InputIterator>,
-          thrust::iterator_value<OutputIterator>
-        >
-      >
-{
-}; // end intermediate_type_from_function_and_iterators
-
-} // end detail
-
-} // end thrust
-
diff --git a/thrust/detail/type_traits/function_traits.h b/thrust/detail/type_traits/function_traits.h
index 0c7775c0d..109820136 100644
--- a/thrust/detail/type_traits/function_traits.h
+++ b/thrust/detail/type_traits/function_traits.h
@@ -16,11 +16,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/type_traits/has_nested_type.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 // forward definitions for is_commutative
 template <typename T> struct plus;
@@ -92,5 +93,5 @@ template<typename T> struct is_commutative< typename thrust::bit_and<T>     > :
 template<typename T> struct is_commutative< typename thrust::bit_xor<T>     > : public thrust::detail::is_arithmetic<T> {};
 
 } // end namespace detail
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/type_traits/has_member_function.h b/thrust/detail/type_traits/has_member_function.h
index 03ed61b6d..c33fe28f6 100644
--- a/thrust/detail/type_traits/has_member_function.h
+++ b/thrust/detail/type_traits/has_member_function.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -18,101 +18,21 @@
 
 #include <thrust/detail/type_traits.h>
 
-#define __THRUST_DEFINE_HAS_MEMBER_FUNCTION(trait_name, member_function_name)                                \
-template<typename T, typename Signature> class trait_name;                                                   \
-                                                                                                             \
-template<typename T, typename Result>                                                                        \
-class trait_name<T, Result(void)>                                                                            \
-{                                                                                                            \
-   class yes { char m; };                                                                                    \
-   class no { yes m[2]; };                                                                                   \
-   struct base_mixin                                                                                         \
-   {                                                                                                         \
-     Result member_function_name();                                                                          \
-   };                                                                                                        \
-   struct base : public T, public base_mixin {};                                                             \
-   template <typename U, U t>  class helper{};                                                               \
-   template <typename U>                                                                                     \
-   static no deduce(U*, helper<Result (base_mixin::*)(), &U::member_function_name>* = 0);                    \
-   static yes deduce(...);                                                                                   \
-public:                                                                                                      \
-   static const bool value = sizeof(yes) == sizeof(deduce(static_cast<base*>(0)));                           \
-   typedef thrust::detail::integral_constant<bool,value> type;                                               \
-};                                                                                                           \
-                                                                                                             \
-template<typename T, typename Result, typename Arg>                                                          \
-class trait_name<T, Result(Arg)>                                                                             \
-{                                                                                                            \
-   class yes { char m; };                                                                                    \
-   class no { yes m[2]; };                                                                                   \
-   struct base_mixin                                                                                         \
-   {                                                                                                         \
-     Result member_function_name(Arg);                                                                       \
-   };                                                                                                        \
-   struct base : public T, public base_mixin {};                                                             \
-   template <typename U, U t>  class helper{};                                                               \
-   template <typename U>                                                                                     \
-   static no deduce(U*, helper<Result (base_mixin::*)(Arg), &U::member_function_name>* = 0);                 \
-   static yes deduce(...);                                                                                   \
-public:                                                                                                      \
-   static const bool value = sizeof(yes) == sizeof(deduce(static_cast<base*>(0)));                           \
-   typedef thrust::detail::integral_constant<bool,value> type;                                               \
-};                                                                                                           \
-                                                                                                             \
-template<typename T, typename Result, typename Arg1, typename Arg2>                                          \
-class trait_name<T, Result(Arg1,Arg2)>                                                                       \
-{                                                                                                            \
-   class yes { char m; };                                                                                    \
-   class no { yes m[2]; };                                                                                   \
-   struct base_mixin                                                                                         \
-   {                                                                                                         \
-     Result member_function_name(Arg1,Arg2);                                                                 \
-   };                                                                                                        \
-   struct base : public T, public base_mixin {};                                                             \
-   template <typename U, U t>  class helper{};                                                               \
-   template <typename U>                                                                                     \
-   static no deduce(U*, helper<Result (base_mixin::*)(Arg1,Arg2), &U::member_function_name>* = 0);           \
-   static yes deduce(...);                                                                                   \
-public:                                                                                                      \
-   static const bool value = sizeof(yes) == sizeof(deduce(static_cast<base*>(0)));                           \
-   typedef thrust::detail::integral_constant<bool,value> type;                                               \
-};                                                                                                           \
-                                                                                                             \
-template<typename T, typename Result, typename Arg1, typename Arg2, typename Arg3>                           \
-class trait_name<T, Result(Arg1,Arg2,Arg3)>                                                                  \
-{                                                                                                            \
-   class yes { char m; };                                                                                    \
-   class no { yes m[2]; };                                                                                   \
-   struct base_mixin                                                                                         \
-   {                                                                                                         \
-     Result member_function_name(Arg1,Arg2,Arg3);                                                            \
-   };                                                                                                        \
-   struct base : public T, public base_mixin {};                                                             \
-   template <typename U, U t>  class helper{};                                                               \
-   template <typename U>                                                                                     \
-   static no deduce(U*, helper<Result (base_mixin::*)(Arg1,Arg2,Arg3), &U::member_function_name>* = 0);      \
-   static yes deduce(...);                                                                                   \
-public:                                                                                                      \
-   static const bool value = sizeof(yes) == sizeof(deduce(static_cast<base*>(0)));                           \
-   typedef thrust::detail::integral_constant<bool,value> type;                                               \
-};                                                                                                           \
-                                                                                                             \
-template<typename T, typename Result, typename Arg1, typename Arg2, typename Arg3, typename Arg4>            \
-class trait_name<T, Result(Arg1,Arg2,Arg3,Arg4)>                                                             \
-{                                                                                                            \
-   class yes { char m; };                                                                                    \
-   class no { yes m[2]; };                                                                                   \
-   struct base_mixin                                                                                         \
-   {                                                                                                         \
-     Result member_function_name(Arg1,Arg2,Arg3,Arg4);                                                       \
-   };                                                                                                        \
-   struct base : public T, public base_mixin {};                                                             \
-   template <typename U, U t>  class helper{};                                                               \
-   template <typename U>                                                                                     \
-   static no deduce(U*, helper<Result (base_mixin::*)(Arg1,Arg2,Arg3,Arg4), &U::member_function_name>* = 0); \
-   static yes deduce(...);                                                                                   \
-public:                                                                                                      \
-   static const bool value = sizeof(yes) == sizeof(deduce(static_cast<base*>(0)));                           \
-   typedef thrust::detail::integral_constant<bool,value> type;                                               \
-};                                                                                                           
+#include <utility> // for std::declval
 
+#define __THRUST_DEFINE_HAS_MEMBER_FUNCTION(trait_name, member_function_name)  \
+  template <typename T, typename Signature, typename = void>                   \
+  struct trait_name : thrust::false_type                                       \
+  {};                                                                          \
+                                                                               \
+  template <typename T, typename ResultT, typename... Args>                    \
+  struct trait_name<T,                                                         \
+                    ResultT(Args...),                                          \
+                    typename thrust::detail::enable_if<                        \
+                      thrust::detail::is_same<ResultT, void>::value ||         \
+                      thrust::detail::is_convertible<                          \
+                        ResultT,                                               \
+                        decltype(std::declval<T>().member_function_name(       \
+                          std::declval<Args>()...))>::value>::type>            \
+      : thrust::true_type                                                      \
+  {};
diff --git a/thrust/detail/type_traits/has_trivial_assign.h b/thrust/detail/type_traits/has_trivial_assign.h
index 01f26c7ef..7222ce593 100644
--- a/thrust/detail/type_traits/has_trivial_assign.h
+++ b/thrust/detail/type_traits/has_trivial_assign.h
@@ -25,30 +25,23 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+#include <cuda/std/type_traits>
+
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
 
-template<typename T> struct has_trivial_assign
+template<typename T> 
+struct has_trivial_assign
   : public integral_constant<
-      bool,
-      (is_pod<T>::value && !is_const<T>::value)
-#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
-      || __has_trivial_assign(T)
-#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC
-// only use the intrinsic for >= 4.3
-#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 3)
-      || __has_trivial_assign(T)
-#endif // GCC VERSION
-#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG
-      || __has_trivial_assign(T)
-#endif // THRUST_HOST_COMPILER
+      bool, 
+      (is_pod<T>::value && !is_const<T>::value) 
+      || ::cuda::std::is_trivially_copy_assignable<T>::value
     >
 {};
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/type_traits/is_call_possible.h b/thrust/detail/type_traits/is_call_possible.h
index bff049377..58c1aca4d 100644
--- a/thrust/detail/type_traits/is_call_possible.h
+++ b/thrust/detail/type_traits/is_call_possible.h
@@ -16,14 +16,15 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/type_traits/has_member_function.h>
 
 // inspired by Roman Perepelitsa's presentation from comp.lang.c++.moderated
 // based on the implementation here: http://www.rsdn.ru/forum/cpp/2759773.1.aspx
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace is_call_possible_detail
@@ -51,7 +52,7 @@ struct clone_constness<const src_type, dest_type>
 
 } // end is_call_possible_detail
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #define __THRUST_DEFINE_IS_CALL_POSSIBLE(trait_name, member_function_name)                                                                \
 __THRUST_DEFINE_HAS_MEMBER_FUNCTION(trait_name##_has_member, member_function_name)                                                        \
diff --git a/thrust/detail/type_traits/is_metafunction_defined.h b/thrust/detail/type_traits/is_metafunction_defined.h
index c278e5bdb..2c7a4be52 100644
--- a/thrust/detail/type_traits/is_metafunction_defined.h
+++ b/thrust/detail/type_traits/is_metafunction_defined.h
@@ -16,11 +16,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/type_traits/has_nested_type.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -37,5 +38,5 @@ template<typename Metafunction>
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/type_traits/iterator/is_discard_iterator.h b/thrust/detail/type_traits/iterator/is_discard_iterator.h
index 0a5900de2..210409d62 100644
--- a/thrust/detail/type_traits/iterator/is_discard_iterator.h
+++ b/thrust/detail/type_traits/iterator/is_discard_iterator.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/type_traits.h>
 #include <thrust/iterator/discard_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -36,5 +35,5 @@ struct is_discard_iterator< thrust::discard_iterator<System> >
 {};
 
 } // end namespace detail
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/type_traits/iterator/is_output_iterator.h b/thrust/detail/type_traits/iterator/is_output_iterator.h
index d6801305b..555b67400 100644
--- a/thrust/detail/type_traits/iterator/is_output_iterator.h
+++ b/thrust/detail/type_traits/iterator/is_output_iterator.h
@@ -22,8 +22,7 @@
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/iterator/detail/any_assign.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -62,5 +61,5 @@ template<typename T>
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/type_traits/minimum_type.h b/thrust/detail/type_traits/minimum_type.h
index 7e34f4f8a..2417e327d 100644
--- a/thrust/detail/type_traits/minimum_type.h
+++ b/thrust/detail/type_traits/minimum_type.h
@@ -16,10 +16,11 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 { 
@@ -56,8 +57,8 @@ struct primitive_minimum_type
   : minimum_type_detail::minimum_type_impl<
       T1,
       T2,
-      ::thrust::detail::is_convertible<T1,T2>::value,
-      ::thrust::detail::is_convertible<T2,T1>::value
+      THRUST_NS_QUALIFIER::detail::is_convertible<T1,T2>::value,
+      THRUST_NS_QUALIFIER::detail::is_convertible<T2,T1>::value
     >
 {
 }; // end primitive_minimum_type
@@ -158,5 +159,5 @@ template<typename T1,  typename T2,  typename T3,  typename T4,
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/type_traits/pointer_traits.h b/thrust/detail/type_traits/pointer_traits.h
index 48ac7d6dc..90a8bc29d 100644
--- a/thrust/detail/type_traits/pointer_traits.h
+++ b/thrust/detail/type_traits/pointer_traits.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2018 NVIDIA Corporation
+ *  Copyright 2008-2020 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -22,9 +22,9 @@
 #include <thrust/detail/type_traits/has_nested_type.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <cstddef>
+#include <type_traits>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -83,34 +83,58 @@ template<typename Ptr, typename T> struct rebind_pointer;
 template<typename T, typename U>
   struct rebind_pointer<T*,U>
 {
-  typedef U* type;
+  using type = U*;
+};
+
+// Rebind generic fancy pointers.
+template<template<typename, typename...> class Ptr, typename OldT, typename... Tail, typename T>
+  struct rebind_pointer<Ptr<OldT,Tail...>,T>
+{
+  using type = Ptr<T,Tail...>;
 };
 
-template<template<typename> class Ptr, typename Arg, typename T>
-  struct rebind_pointer<Ptr<Arg>,T>
+// Rebind `thrust::pointer`-like things with `thrust::reference`-like references.
+template<template<typename, typename, typename, typename...> class Ptr, typename OldT, typename Tag,
+         template<typename...> class Ref, typename... RefTail,
+         typename... PtrTail, typename T>
+  struct rebind_pointer<Ptr<OldT,Tag,Ref<OldT,RefTail...>,PtrTail...>,T>
 {
-  typedef Ptr<T> type;
+//  static_assert(std::is_same<OldT, Tag>::value, "0");
+  using type = Ptr<T,Tag,Ref<T,RefTail...>,PtrTail...>;
 };
 
-template<template<typename, typename> class Ptr, typename Arg1, typename Arg2, typename T>
-  struct rebind_pointer<Ptr<Arg1,Arg2>,T>
+// Rebind `thrust::pointer`-like things with `thrust::reference`-like references
+// and templated derived types.
+template<template<typename, typename, typename, typename...> class Ptr, typename OldT, typename Tag,
+         template<typename...> class Ref, typename... RefTail,
+         template<typename...> class DerivedPtr, typename... DerivedPtrTail,
+         typename T>
+  struct rebind_pointer<Ptr<OldT,Tag,Ref<OldT,RefTail...>,DerivedPtr<OldT,DerivedPtrTail...>>,T>
 {
-  typedef Ptr<T,Arg2> type;
+//  static_assert(std::is_same<OldT, Tag>::value, "1");
+  using type = Ptr<T,Tag,Ref<T,RefTail...>,DerivedPtr<T,DerivedPtrTail...>>;
 };
 
-template<template<typename, typename, typename> class Ptr, typename Arg1, typename Arg2, typename Arg3, typename T>
-  struct rebind_pointer<Ptr<Arg1,Arg2,Arg3>,T>
+// Rebind `thrust::pointer`-like things with native reference types.
+template<template<typename, typename, typename, typename...> class Ptr, typename OldT, typename Tag,
+         typename... PtrTail, typename T>
+  struct rebind_pointer<Ptr<OldT,Tag,typename std::add_lvalue_reference<OldT>::type,PtrTail...>,T>
 {
-  typedef Ptr<T,Arg2,Arg3> type;
+//  static_assert(std::is_same<OldT, Tag>::value, "2");
+  using type = Ptr<T,Tag,typename std::add_lvalue_reference<T>::type,PtrTail...>;
 };
 
-template<template<typename, typename, typename, typename> class Ptr, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename T>
-  struct rebind_pointer<Ptr<Arg1,Arg2,Arg3,Arg4>,T>
+// Rebind `thrust::pointer`-like things with native reference types and templated
+// derived types.
+template<template<typename, typename, typename, typename...> class Ptr, typename OldT, typename Tag,
+         template<typename...> class DerivedPtr, typename... DerivedPtrTail,
+         typename T>
+  struct rebind_pointer<Ptr<OldT,Tag,typename std::add_lvalue_reference<OldT>::type,DerivedPtr<OldT,DerivedPtrTail...>>,T>
 {
-  typedef Ptr<T,Arg2,Arg3,Arg4> type;
+//  static_assert(std::is_same<OldT, Tag>::value, "3");
+  using type = Ptr<T,Tag,typename std::add_lvalue_reference<T>::type,DerivedPtr<T,DerivedPtrTail...>>;
 };
 
-// XXX this should probably be renamed native_type or similar
 __THRUST_DEFINE_HAS_NESTED_TYPE(has_raw_pointer, raw_pointer)
 
 namespace pointer_traits_detail
@@ -179,7 +203,7 @@ template<typename Ptr>
   typedef typename pointer_difference<Ptr>::type difference_type;
 
   template<typename U>
-    struct rebind 
+    struct rebind
   {
     typedef typename rebind_pointer<Ptr,U>::type other;
   };
@@ -189,7 +213,7 @@ template<typename Ptr>
   {
     // XXX this is supposed to be pointer::pointer_to(&r); (i.e., call a static member function of pointer called pointer_to)
     //     assume that pointer has a constructor from raw pointer instead
-    
+
     return pointer(&r);
   }
 
@@ -367,5 +391,5 @@ template<typename FromPtr, typename ToPtr, typename T = void>
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/type_traits/result_of_adaptable_function.h b/thrust/detail/type_traits/result_of_adaptable_function.h
index 5d862affd..edf797f14 100644
--- a/thrust/detail/type_traits/result_of_adaptable_function.h
+++ b/thrust/detail/type_traits/result_of_adaptable_function.h
@@ -20,46 +20,40 @@
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/type_traits/function_traits.h>
 
-#if __cplusplus >= 201103L || defined(__cpp_lib_result_of_sfinae)
-// necessary for std::result_of
 #include <type_traits>
-#endif
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
-// In the C++11 mode, by default, result_of_adaptable function inheritfrom std::result_of
-#if __cplusplus >= 201103L || defined(__cpp_lib_result_of_sfinae)
+// Sets `type` to the result of the specified Signature invocation. If the
+// callable defines a `result_type` alias member, that type is used instead.
+// Use invoke_result / result_of when FuncType::result_type is not defined.
 template <typename Signature, typename Enable = void>
-struct result_of_adaptable_function : std::result_of<Signature> {};
-#else  /* cxx11 */
-template<typename Signature, typename Enable = void> 
-struct result_of_adaptable_function;
-#endif  /* cxx11 */
-
-// specialization for unary invocations of things which have result_type
-template<typename Functor, typename Arg1>
-  struct result_of_adaptable_function<
-    Functor(Arg1),
-    typename thrust::detail::enable_if<thrust::detail::has_result_type<Functor>::value>::type
-  >
+struct result_of_adaptable_function
 {
-  typedef typename Functor::result_type type;
-}; // end result_of
+private:
+  template <typename Sig> struct impl;
 
-// specialization for binary invocations of things which have result_type
-template<typename Functor, typename Arg1, typename Arg2>
-  struct result_of_adaptable_function<
-    Functor(Arg1,Arg2),
-    typename thrust::detail::enable_if<thrust::detail::has_result_type<Functor>::value>::type
-  >
-{
-  typedef typename Functor::result_type type;
-};
+  template <typename F, typename... Args>
+  struct impl<F(Args...)>
+  {
+    using type = invoke_result_t<F, Args...>;
+  };
 
+public:
+  using type = typename impl<Signature>::type;
+};
 
-} // end detail
-} // end thrust
+// specialization for invocations which define result_type
+template <typename Functor, typename... ArgTypes>
+struct result_of_adaptable_function<
+  Functor(ArgTypes...),
+  typename thrust::detail::enable_if<
+    thrust::detail::has_result_type<Functor>::value>::type>
+{
+  using type = typename Functor::result_type;
+};
 
+} // namespace detail
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/uninitialized_copy.inl b/thrust/detail/uninitialized_copy.inl
index 660df76d5..2778693ad 100644
--- a/thrust/detail/uninitialized_copy.inl
+++ b/thrust/detail/uninitialized_copy.inl
@@ -14,10 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file uninitialized_copy.inl
- *  \brief Inline file for uninitialized_copy.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/uninitialized_copy.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,8 +24,7 @@
 #include <thrust/system/detail/generic/uninitialized_copy.h>
 #include <thrust/system/detail/adl/uninitialized_copy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -92,6 +90,6 @@ template<typename InputIterator,
 } // end uninitialized_copy_n()
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 
diff --git a/thrust/detail/uninitialized_fill.inl b/thrust/detail/uninitialized_fill.inl
index 30eab23a2..e013dac7b 100644
--- a/thrust/detail/uninitialized_fill.inl
+++ b/thrust/detail/uninitialized_fill.inl
@@ -14,10 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file uninitialized_fill.inl
- *  \brief Inline file for uninitialized_fill.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/uninitialized_fill.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,8 +24,7 @@
 #include <thrust/system/detail/generic/uninitialized_fill.h>
 #include <thrust/system/detail/adl/uninitialized_fill.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -88,5 +86,5 @@ template<typename ForwardIterator,
 } // end uninitialized_fill_n()
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/unique.inl b/thrust/detail/unique.inl
index b6fa9304d..ac5475f02 100644
--- a/thrust/detail/unique.inl
+++ b/thrust/detail/unique.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file unique.inl
- *  \brief Inline file for unique.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/unique.h>
@@ -28,8 +25,7 @@
 #include <thrust/system/detail/adl/unique.h>
 #include <thrust/system/detail/adl/unique_by_key.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -99,7 +95,7 @@ template<typename DerivedPolicy,
 __host__ __device__
   thrust::pair<ForwardIterator1,ForwardIterator2>
   unique_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                ForwardIterator1 keys_first, 
+                ForwardIterator1 keys_first,
                 ForwardIterator1 keys_last,
                 ForwardIterator2 values_first)
 {
@@ -116,7 +112,7 @@ template<typename DerivedPolicy,
 __host__ __device__
   thrust::pair<ForwardIterator1,ForwardIterator2>
   unique_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                ForwardIterator1 keys_first, 
+                ForwardIterator1 keys_first,
                 ForwardIterator1 keys_last,
                 ForwardIterator2 values_first,
                 BinaryPredicate binary_pred)
@@ -135,7 +131,7 @@ template<typename DerivedPolicy,
 __host__ __device__
   thrust::pair<OutputIterator1,OutputIterator2>
   unique_by_key_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                     InputIterator1 keys_first, 
+                     InputIterator1 keys_first,
                      InputIterator1 keys_last,
                      InputIterator2 values_first,
                      OutputIterator1 keys_output,
@@ -156,7 +152,7 @@ template<typename DerivedPolicy,
 __host__ __device__
   thrust::pair<OutputIterator1,OutputIterator2>
   unique_by_key_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                     InputIterator1 keys_first, 
+                     InputIterator1 keys_first,
                      InputIterator1 keys_last,
                      InputIterator2 values_first,
                      OutputIterator1 keys_output,
@@ -239,7 +235,7 @@ template<typename InputIterator,
 template<typename ForwardIterator1,
          typename ForwardIterator2>
   thrust::pair<ForwardIterator1,ForwardIterator2>
-    unique_by_key(ForwardIterator1 keys_first, 
+    unique_by_key(ForwardIterator1 keys_first,
                   ForwardIterator1 keys_last,
                   ForwardIterator2 values_first)
 {
@@ -259,7 +255,7 @@ template<typename ForwardIterator1,
          typename ForwardIterator2,
          typename BinaryPredicate>
   thrust::pair<ForwardIterator1,ForwardIterator2>
-    unique_by_key(ForwardIterator1 keys_first, 
+    unique_by_key(ForwardIterator1 keys_first,
                   ForwardIterator1 keys_last,
                   ForwardIterator2 values_first,
                   BinaryPredicate binary_pred)
@@ -281,7 +277,7 @@ template<typename InputIterator1,
          typename OutputIterator1,
          typename OutputIterator2>
   thrust::pair<OutputIterator1,OutputIterator2>
-    unique_by_key_copy(InputIterator1 keys_first, 
+    unique_by_key_copy(InputIterator1 keys_first,
                        InputIterator1 keys_last,
                        InputIterator2 values_first,
                        OutputIterator1 keys_output,
@@ -309,7 +305,7 @@ template<typename InputIterator1,
          typename OutputIterator2,
          typename BinaryPredicate>
   thrust::pair<OutputIterator1,OutputIterator2>
-    unique_by_key_copy(InputIterator1 keys_first, 
+    unique_by_key_copy(InputIterator1 keys_first,
                        InputIterator1 keys_last,
                        InputIterator2 values_first,
                        OutputIterator1 keys_output,
@@ -331,6 +327,67 @@ template<typename InputIterator1,
   return thrust::unique_by_key_copy(select_system(system1,system2,system3,system4), keys_first, keys_last, values_first, keys_output, values_output, binary_pred);
 } // end unique_by_key_copy()
 
+__thrust_exec_check_disable__
+template <typename DerivedPolicy,
+          typename ForwardIterator,
+          typename BinaryPredicate>
+__host__ __device__
+    typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred)
+{
+  using thrust::system::detail::generic::unique_count;
+  return unique_count(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, binary_pred);
+} // end unique_count()
+
+__thrust_exec_check_disable__
+template <typename DerivedPolicy,
+          typename ForwardIterator>
+__host__ __device__
+    typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last)
+{
+  using thrust::system::detail::generic::unique_count;
+  return unique_count(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last);
+} // end unique_count()
+
+__thrust_exec_check_disable__
+template <typename ForwardIterator,
+          typename BinaryPredicate>
+__host__ __device__
+    typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::unique_count(select_system(system), first, last, binary_pred);
+} // end unique_count()
+
+__thrust_exec_check_disable__
+template <typename ForwardIterator>
+__host__ __device__
+    typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(ForwardIterator first,
+                 ForwardIterator last)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::unique_count(select_system(system), first, last);
+} // end unique_count()
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/use_default.h b/thrust/detail/use_default.h
index ba2c27bc5..f25b6274c 100644
--- a/thrust/detail/use_default.h
+++ b/thrust/detail/use_default.h
@@ -18,10 +18,9 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 struct use_default {};
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/util/align.h b/thrust/detail/util/align.h
index af97cd44a..a3aa75bfe 100644
--- a/thrust/detail/util/align.h
+++ b/thrust/detail/util/align.h
@@ -17,12 +17,13 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/cstdint.h>
 
 // functions to handle memory alignment
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace util
@@ -55,5 +56,5 @@ bool is_aligned(T * ptr, detail::uintptr_t bytes = sizeof(T))
 
 } // end namespace util
 } // end namespace detail
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/vector_base.h b/thrust/detail/vector_base.h
index 49cd07070..0c4da449e 100644
--- a/thrust/detail/vector_base.h
+++ b/thrust/detail/vector_base.h
@@ -26,11 +26,13 @@
 #include <thrust/iterator/reverse_iterator.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/type_traits.h>
+#include <thrust/detail/config.h>
 #include <thrust/detail/contiguous_storage.h>
+
+#include <initializer_list>
 #include <vector>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -106,7 +108,7 @@ template<typename T, typename Alloc>
      */
     vector_base(const vector_base &v, const Alloc &alloc);
 
-  #if __cplusplus >= 201103L
+  #if THRUST_CPP_DIALECT >= 2011
     /*! Move constructor moves from another vector_base.
      *  \param v The vector_base to move.
      */
@@ -123,12 +125,26 @@ template<typename T, typename Alloc>
      */
     vector_base &operator=(const vector_base &v);
 
-  #if __cplusplus >= 201103L
     /*! Move assign operator moves from another vector_base.
      *  \param v The vector_base to move.
      */
     vector_base &operator=(vector_base &&v);
-  #endif
+
+    /*! This constructor builds a \p vector_base from an intializer_list.
+     *  \param il The intializer_list.
+     */
+    vector_base(std::initializer_list<T> il);
+      
+    /*! This constructor builds a \p vector_base from an intializer_list.
+     *  \param il The intializer_list.
+     *  \param alloc The allocator to use by this device_vector.
+     */
+    vector_base(std::initializer_list<T> il, const Alloc &alloc);
+    
+    /*! Assign operator copies from an initializer_list
+     *  \param il The initializer_list.
+     */
+    vector_base &operator=(std::initializer_list<T> il);
 
     /*! Copy constructor copies from an exemplar vector_base with different
      *  type.
@@ -206,11 +222,13 @@ template<typename T, typename Alloc>
 
     /*! Returns the number of elements in this vector_base.
      */
+    __host__ __device__
     size_type size(void) const;
 
     /*! Returns the size() of the largest possible vector_base.
      *  \return The largest possible return value of size().
      */
+    __host__ __device__
     size_type max_size(void) const;
 
     /*! \brief If n is less than or equal to capacity(), this call has no effect.
@@ -224,6 +242,7 @@ template<typename T, typename Alloc>
     /*! Returns the number of elements which have been reserved in this
      *  vector_base.
      */
+    __host__ __device__
     size_type capacity(void) const;
 
     /*! This method shrinks the capacity of this vector_base to exactly
@@ -239,6 +258,7 @@ template<typename T, typename Alloc>
      *  Note that data access with this operator is unchecked and
      *  out_of_range lookups are not defined.
      */
+    __host__ __device__
     reference operator[](size_type n);
 
     /*! \brief Subscript read access to the data contained in this vector_dev.
@@ -249,24 +269,28 @@ template<typename T, typename Alloc>
      *  Note that data access with this operator is unchecked and
      *  out_of_range lookups are not defined.
      */
+    __host__ __device__
     const_reference operator[](size_type n) const;
 
     /*! This method returns an iterator pointing to the beginning of
      *  this vector_base.
      *  \return mStart
      */
+    __host__ __device__
     iterator begin(void);
 
     /*! This method returns a const_iterator pointing to the beginning
      *  of this vector_base.
      *  \return mStart
      */
+    __host__ __device__
     const_iterator begin(void) const;
 
     /*! This method returns a const_iterator pointing to the beginning
      *  of this vector_base.
      *  \return mStart
      */
+    __host__ __device__
     const_iterator cbegin(void) const;
 
     /*! This method returns a reverse_iterator pointing to the beginning of
@@ -274,6 +298,7 @@ template<typename T, typename Alloc>
      *  \return A reverse_iterator pointing to the beginning of this
      *          vector_base's reversed sequence.
      */
+    __host__ __device__
     reverse_iterator rbegin(void);
 
     /*! This method returns a const_reverse_iterator pointing to the beginning of
@@ -281,6 +306,7 @@ template<typename T, typename Alloc>
      *  \return A const_reverse_iterator pointing to the beginning of this
      *          vector_base's reversed sequence.
      */
+    __host__ __device__
     const_reverse_iterator rbegin(void) const;
 
     /*! This method returns a const_reverse_iterator pointing to the beginning of
@@ -288,76 +314,89 @@ template<typename T, typename Alloc>
      *  \return A const_reverse_iterator pointing to the beginning of this
      *          vector_base's reversed sequence.
      */
+    __host__ __device__
     const_reverse_iterator crbegin(void) const;
 
     /*! This method returns an iterator pointing to one element past the
      *  last of this vector_base.
      *  \return begin() + size().
      */
+    __host__ __device__
     iterator end(void);
 
     /*! This method returns a const_iterator pointing to one element past the
      *  last of this vector_base.
      *  \return begin() + size().
      */
+    __host__ __device__
     const_iterator end(void) const;
 
     /*! This method returns a const_iterator pointing to one element past the
      *  last of this vector_base.
      *  \return begin() + size().
      */
+    __host__ __device__
     const_iterator cend(void) const;
 
     /*! This method returns a reverse_iterator pointing to one element past the
      *  last of this vector_base's reversed sequence.
      *  \return rbegin() + size().
      */
+    __host__ __device__
     reverse_iterator rend(void);
 
     /*! This method returns a const_reverse_iterator pointing to one element past the
      *  last of this vector_base's reversed sequence.
      *  \return rbegin() + size().
      */
+    __host__ __device__
     const_reverse_iterator rend(void) const;
 
     /*! This method returns a const_reverse_iterator pointing to one element past the
      *  last of this vector_base's reversed sequence.
      *  \return rbegin() + size().
      */
+    __host__ __device__
     const_reverse_iterator crend(void) const;
 
     /*! This method returns a const_reference referring to the first element of this
      *  vector_base.
      *  \return The first element of this vector_base.
      */
+    __host__ __device__
     const_reference front(void) const;
 
     /*! This method returns a reference pointing to the first element of this
      *  vector_base.
      *  \return The first element of this vector_base.
      */
+    __host__ __device__
     reference front(void);
 
     /*! This method returns a const reference pointing to the last element of
      *  this vector_base.
      *  \return The last element of this vector_base.
      */
+    __host__ __device__
     const_reference back(void) const;
 
     /*! This method returns a reference referring to the last element of
      *  this vector_dev.
      *  \return The last element of this vector_base.
      */
+    __host__ __device__
     reference back(void);
 
     /*! This method returns a pointer to this vector_base's first element.
      *  \return A pointer to the first element of this vector_base.
      */
+    __host__ __device__
     pointer data(void);
 
     /*! This method returns a const_pointer to this vector_base's first element.
      *  \return a const_pointer to the first element of this vector_base.
      */
+    __host__ __device__
     const_pointer data(void) const;
 
     /*! This method resizes this vector_base to 0.
@@ -367,6 +406,7 @@ template<typename T, typename Alloc>
     /*! This method returns true iff size() == 0.
      *  \return true if size() == 0; false, otherwise.
      */
+    __host__ __device__
     bool empty(void) const;
 
     /*! This method appends the given element to the end of this vector_base.
@@ -421,8 +461,8 @@ template<typename T, typename Alloc>
      *  \param first The beginning of the range to copy.
      *  \param last  The end of the range to copy.
      *
-     *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html>Input Iterator</a>,
-     *                        and \p InputIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>.
+     *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator>Input Iterator</a>,
+     *                        and \p InputIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>.
      */
     template<typename InputIterator>
     void insert(iterator position, InputIterator first, InputIterator last);
@@ -438,7 +478,7 @@ template<typename T, typename Alloc>
      *  \param first The beginning of the range to copy.
      *  \param last  The end of the range to copy.
      *
-     *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator">Input Iterator</a>.
+     *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/named_req/InputIterator">Input Iterator</a>.
      */
     template<typename InputIterator>
     void assign(InputIterator first, InputIterator last);
@@ -581,7 +621,7 @@ template<typename T1, typename Alloc1,
 bool operator!=(const std::vector<T1,Alloc1>&         lhs,
                 const detail::vector_base<T2,Alloc2>& rhs);
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/vector_base.inl>
 
diff --git a/thrust/detail/vector_base.inl b/thrust/detail/vector_base.inl
index 1e8e2eec5..bdd6c1c7a 100644
--- a/thrust/detail/vector_base.inl
+++ b/thrust/detail/vector_base.inl
@@ -14,11 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file vector_base.inl
- *  \brief Inline file for vector_base.h.
- */
-
+#include <thrust/detail/config.h>
 #include <thrust/detail/vector_base.h>
 #include <thrust/detail/copy.h>
 #include <thrust/detail/overlapped_copy.h>
@@ -32,8 +30,7 @@
 
 #include <stdexcept>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -110,7 +107,7 @@ template<typename T, typename Alloc>
   range_init(v.begin(), v.end());
 } // end vector_base::vector_base()
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
   template<typename T, typename Alloc>
     vector_base<T,Alloc>
       ::vector_base(vector_base &&v)
@@ -139,7 +136,7 @@ template<typename T, typename Alloc>
   return *this;
 } // end vector_base::operator=()
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
   template<typename T, typename Alloc>
     vector_base<T,Alloc> &
       vector_base<T,Alloc>
@@ -198,6 +195,34 @@ template<typename T, typename Alloc>
   return *this;
 } // end vector_base::operator=()
 
+  template<typename T, typename Alloc>
+    vector_base<T,Alloc>
+      ::vector_base(std::initializer_list<T> il)
+        :m_storage(),
+         m_size(0)
+  {
+    range_init(il.begin(), il.end());
+  } // end vector_base::vector_base()
+
+  template<typename T, typename Alloc>
+  vector_base<T,Alloc>
+    ::vector_base(std::initializer_list<T> il, const Alloc &alloc)
+    :m_storage(alloc),
+      m_size(0)
+  {
+    range_init(il.begin(), il.end());
+  } // end vector_base::vector_base()
+
+  template<typename T, typename Alloc>
+    vector_base<T,Alloc> &
+      vector_base<T,Alloc>
+      ::operator=(std::initializer_list<T> il)
+  {
+    assign(il.begin(), il.end());
+
+    return *this;
+  } // end vector_base::operator=()
+
 template<typename T, typename Alloc>
   template<typename IteratorOrIntegralType>
     void vector_base<T,Alloc>
@@ -342,6 +367,7 @@ template<typename T, typename Alloc>
 } // end vector_base::resize()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::size_type
     vector_base<T,Alloc>
       ::size(void) const
@@ -350,6 +376,7 @@ template<typename T, typename Alloc>
 } // end vector_base::size()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::size_type
     vector_base<T,Alloc>
       ::max_size(void) const
@@ -363,11 +390,43 @@ template<typename T, typename Alloc>
 {
   if(n > capacity())
   {
-    allocate_and_copy(n, begin(), end(), m_storage);
+    // compute the new capacity after the allocation
+    size_type new_capacity = n;
+
+    // do not exceed maximum storage
+    new_capacity = thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION <size_type>(new_capacity, max_size());
+
+    // create new storage
+    storage_type new_storage(copy_allocator_t(), m_storage, new_capacity);
+
+    // record how many constructors we invoke in the try block below
+    iterator new_end = new_storage.begin();
+
+    try
+    {
+      // construct copy all elements into the newly allocated storage
+      new_end = m_storage.uninitialized_copy(begin(), end(), new_storage.begin());
+    } // end try
+    catch(...)
+    {
+      // something went wrong, so destroy & deallocate the new storage
+      new_storage.destroy(new_storage.begin(), new_end);
+      new_storage.deallocate();
+
+      // rethrow
+      throw;
+    } // end catch
+
+    // call destructors on the elements in the old storage
+    m_storage.destroy(begin(), end());
+
+    // record the vector's new state
+    m_storage.swap(new_storage);
   } // end if
 } // end vector_base::reserve()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::size_type
     vector_base<T,Alloc>
       ::capacity(void) const
@@ -384,6 +443,7 @@ template<typename T, typename Alloc>
 } // end vector_base::shrink_to_fit()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::reference
     vector_base<T,Alloc>
       ::operator[](const size_type n)
@@ -392,6 +452,7 @@ template<typename T, typename Alloc>
 } // end vector_base::operator[]
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_reference
     vector_base<T,Alloc>
       ::operator[](const size_type n) const
@@ -400,6 +461,7 @@ template<typename T, typename Alloc>
 } // end vector_base::operator[]
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::iterator
     vector_base<T,Alloc>
       ::begin(void)
@@ -408,6 +470,7 @@ template<typename T, typename Alloc>
 } // end vector_base::begin()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_iterator
     vector_base<T,Alloc>
       ::begin(void) const
@@ -416,6 +479,7 @@ template<typename T, typename Alloc>
 } // end vector_base::begin()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_iterator
     vector_base<T,Alloc>
       ::cbegin(void) const
@@ -424,6 +488,7 @@ template<typename T, typename Alloc>
 } // end vector_base::cbegin()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::reverse_iterator
     vector_base<T,Alloc>
       ::rbegin(void)
@@ -432,6 +497,7 @@ template<typename T, typename Alloc>
 } // end vector_base::rbegin()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_reverse_iterator
     vector_base<T,Alloc>
       ::rbegin(void) const
@@ -440,6 +506,7 @@ template<typename T, typename Alloc>
 } // end vector_base::rbegin()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_reverse_iterator
     vector_base<T,Alloc>
       ::crbegin(void) const
@@ -448,6 +515,7 @@ template<typename T, typename Alloc>
 } // end vector_base::crbegin()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::iterator
     vector_base<T,Alloc>
       ::end(void)
@@ -458,6 +526,7 @@ template<typename T, typename Alloc>
 } // end vector_base::end()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_iterator
     vector_base<T,Alloc>
       ::end(void) const
@@ -468,6 +537,7 @@ template<typename T, typename Alloc>
 } // end vector_base::end()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_iterator
     vector_base<T,Alloc>
       ::cend(void) const
@@ -476,6 +546,7 @@ template<typename T, typename Alloc>
 } // end vector_base::cend()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::reverse_iterator
     vector_base<T,Alloc>
       ::rend(void)
@@ -484,6 +555,7 @@ template<typename T, typename Alloc>
 } // end vector_base::rend()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_reverse_iterator
     vector_base<T,Alloc>
       ::rend(void) const
@@ -492,6 +564,7 @@ template<typename T, typename Alloc>
 } // end vector_base::rend()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_reverse_iterator
     vector_base<T,Alloc>
       ::crend(void) const
@@ -500,6 +573,7 @@ template<typename T, typename Alloc>
 } // end vector_base::crend()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_reference
     vector_base<T,Alloc>
       ::front(void) const
@@ -508,6 +582,7 @@ template<typename T, typename Alloc>
 } // end vector_base::front()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::reference
     vector_base<T,Alloc>
       ::front(void)
@@ -516,6 +591,7 @@ template<typename T, typename Alloc>
 } // end vector_base::front()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_reference
     vector_base<T,Alloc>
       ::back(void) const
@@ -526,6 +602,7 @@ template<typename T, typename Alloc>
 } // end vector_base::vector_base
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::reference
     vector_base<T,Alloc>
       ::back(void)
@@ -536,19 +613,21 @@ template<typename T, typename Alloc>
 } // end vector_base::vector_base
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::pointer
     vector_base<T,Alloc>
       ::data(void)
 {
-  return &front();
+  return pointer(&front());
 } // end vector_base::data()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_pointer
     vector_base<T,Alloc>
       ::data(void) const
 {
-  return &front();
+  return const_pointer(&front());
 } // end vector_base::data()
 
 template<typename T, typename Alloc>
@@ -556,7 +635,8 @@ template<typename T, typename Alloc>
     ::~vector_base(void)
 {
   // destroy every living thing
-  m_storage.destroy(begin(),end());
+  if (!empty())
+    m_storage.destroy(begin(),end());
 } // end vector_base::~vector_base()
 
 template<typename T, typename Alloc>
@@ -567,6 +647,7 @@ template<typename T, typename Alloc>
 } // end vector_base::~vector_dev()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   bool vector_base<T,Alloc>
     ::empty(void) const
 {
@@ -875,13 +956,13 @@ template<typename T, typename Alloc>
         new_end = m_storage.uninitialized_copy(begin(), end(), new_storage.begin());
 
         // construct new elements to insert
-        m_storage.default_construct_n(new_end, n);
+        new_storage.default_construct_n(new_end, n);
         new_end += n;
       } // end try
       catch(...)
       {
         // something went wrong, so destroy & deallocate the new storage
-        m_storage.destroy(new_storage.begin(), new_end);
+        new_storage.destroy(new_storage.begin(), new_end);
         new_storage.deallocate();
 
         // rethrow
@@ -1028,7 +1109,7 @@ template<typename T, typename Alloc>
   {
     *current = *first;
   } // end for
-  
+
   // either just the input was exhausted or both
   // the input and vector elements were exhausted
   if(first == last)
@@ -1079,7 +1160,7 @@ template<typename T, typename Alloc>
   {
     // range fits inside allocated storage, but some elements
     // have not been constructed yet
-    
+
     // XXX TODO we could possibly implement this with one call
     // to transform rather than copy + uninitialized_copy
 
@@ -1161,7 +1242,7 @@ template<typename T, typename Alloc>
   } // end try
   catch(...)
   {
-    // something went wrong, so destroy & deallocate the new storage 
+    // something went wrong, so destroy & deallocate the new storage
     // XXX seems like this destroys too many elements -- should just be last - first instead of requested_size
     iterator new_storage_end = new_storage.begin();
     thrust::advance(new_storage_end, requested_size);
@@ -1187,7 +1268,7 @@ template<typename T, typename Alloc>
 
 namespace detail
 {
-    
+
 // iterator tags match
 template <typename InputIterator1, typename InputIterator2>
 bool vector_equal(InputIterator1 first1, InputIterator1 last1,
@@ -1243,7 +1324,7 @@ bool operator==(const detail::vector_base<T1,Alloc1>& lhs,
 {
     return lhs.size() == rhs.size() && detail::vector_equal(lhs.begin(), lhs.end(), rhs.begin());
 }
-    
+
 template<typename T1, typename Alloc1,
          typename T2, typename Alloc2>
 bool operator==(const detail::vector_base<T1,Alloc1>& lhs,
@@ -1267,7 +1348,7 @@ bool operator!=(const detail::vector_base<T1,Alloc1>& lhs,
 {
     return !(lhs == rhs);
 }
-    
+
 template<typename T1, typename Alloc1,
          typename T2, typename Alloc2>
 bool operator!=(const detail::vector_base<T1,Alloc1>& lhs,
@@ -1284,5 +1365,5 @@ bool operator!=(const std::vector<T1,Alloc1>&         lhs,
     return !(lhs == rhs);
 }
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/device_allocator.h b/thrust/device_allocator.h
index 2c4070ad9..f64c3854f 100644
--- a/thrust/device_allocator.h
+++ b/thrust/device_allocator.h
@@ -15,8 +15,9 @@
  */
 
 
-/*! \file device_allocator.h
- *  \brief An allocator which creates new elements in device memory
+/*! \file
+ *  \brief An allocator which creates new elements in memory accessible by
+ *  devices.
  */
 
 #pragma once
@@ -24,16 +25,15 @@
 #include <thrust/detail/config.h>
 #include <thrust/device_ptr.h>
 #include <thrust/mr/allocator.h>
-#include <thrust/memory/detail/device_system_resource.h>
+#include <thrust/mr/device_memory_resource.h>
 
 #include <limits>
 #include <stdexcept>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
-/** \addtogroup memory_resources Memory Resources
- *  \ingroup memory_management_classes
+/** \addtogroup allocators Allocators
+ *  \ingroup memory_management
  *  \{
  */
 
@@ -42,7 +42,7 @@ namespace thrust
  *      a \p device_ptr.
  */
 template<typename Upstream>
-class device_ptr_memory_resource THRUST_FINAL
+class device_ptr_memory_resource final
     : public thrust::mr::memory_resource<
         device_ptr<void>
     >
@@ -68,13 +68,13 @@ class device_ptr_memory_resource THRUST_FINAL
     }
 
     THRUST_NODISCARD __host__
-    virtual pointer do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    virtual pointer do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
     {
         return pointer(m_upstream->do_allocate(bytes, alignment).get());
     }
 
     __host__
-    virtual void do_deallocate(pointer p, std::size_t bytes, std::size_t alignment) THRUST_OVERRIDE
+    virtual void do_deallocate(pointer p, std::size_t bytes, std::size_t alignment) override
     {
         m_upstream->do_deallocate(upstream_ptr(p.get()), bytes, alignment);
     }
@@ -83,13 +83,10 @@ class device_ptr_memory_resource THRUST_FINAL
     Upstream * m_upstream;
 };
 
-/*! \}
- */
-
-/*! \addtogroup memory_management Memory Management
- *  \addtogroup memory_management_classes Memory Management Classes
- *  \ingroup memory_management
- *  \{
+/*! \brief An allocator which creates new elements in memory accessible by
+ *         devices.
+ *
+ *  \see https://en.cppreference.com/w/cpp/named_req/Allocator
  */
 template<typename T>
 class device_allocator
@@ -118,25 +115,26 @@ class device_allocator
     };
 
     /*! Default constructor has no effect. */
-    __host__
+    __host__ __device__
     device_allocator() {}
 
     /*! Copy constructor has no effect. */
-    __host__
+    __host__ __device__
     device_allocator(const device_allocator& other) : base(other) {}
 
     /*! Constructor from other \p device_allocator has no effect. */
     template<typename U>
-    __host__
+    __host__ __device__
     device_allocator(const device_allocator<U>& other) : base(other) {}
 
+    device_allocator & operator=(const device_allocator &) = default;
+
     /*! Destructor has no effect. */
-    __host__
+    __host__ __device__
     ~device_allocator() {}
 };
 
-/*! \}
+/*! \} // allocators
  */
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/device_delete.h b/thrust/device_delete.h
index ce822f09d..0811936fb 100644
--- a/thrust/device_delete.h
+++ b/thrust/device_delete.h
@@ -14,9 +14,8 @@
  *  limitations under the License.
  */
 
-
-/*! \file device_delete.h
- *  \brief Deletes variables in device memory
+/*! \file
+ *  \brief Deletes variables in device memory.
  */
 
 #pragma once
@@ -24,11 +23,9 @@
 #include <thrust/detail/config.h>
 #include <thrust/device_ptr.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
-/*! \addtogroup deallocation_functions Deallocation Functions
- *  \ingroup memory_management_functions
+/*! \addtogroup memory_management Memory Management
  *  \{
  */
 
@@ -47,10 +44,10 @@ template<typename T>
   inline void device_delete(thrust::device_ptr<T> ptr,
                             const size_t n = 1);
 
-/*! \}
+/*! \} // memory_management
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/device_delete.inl>
 
diff --git a/thrust/device_free.h b/thrust/device_free.h
index 38d4424c7..1cd305045 100644
--- a/thrust/device_free.h
+++ b/thrust/device_free.h
@@ -14,9 +14,8 @@
  *  limitations under the License.
  */
 
-
-/*! \file device_free.h
- *  \brief Deallocates storage allocated by \p device_malloc
+/*! \file 
+ *  \brief Deallocates storage allocated by \p device_malloc.
  */
 
 #pragma once
@@ -24,11 +23,9 @@
 #include <thrust/detail/config.h>
 #include <thrust/device_ptr.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
-/*! \addtogroup deallocation_functions Deallocation Functions
- *  \ingroup memory_management_functions
+/*! \addtogroup memory_management Memory Management
  *  \{
  */
 
@@ -59,10 +56,10 @@ namespace thrust
  */
 inline void device_free(thrust::device_ptr<void> ptr);
 
-/*! \}
+/*! \} // memory_management
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/device_free.inl>
 
diff --git a/thrust/device_make_unique.h b/thrust/device_make_unique.h
index cb7e7c3b9..cdb8c31d8 100644
--- a/thrust/device_make_unique.h
+++ b/thrust/device_make_unique.h
@@ -32,27 +32,29 @@
 #include <thrust/device_allocator.h>
 #include <thrust/detail/type_deduction.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 ///////////////////////////////////////////////////////////////////////////////
 
 template <typename T, typename... Args>
 __host__
 auto device_make_unique(Args&&... args)
-  -> decltype(
+  THRUST_TRAILING_RETURN(decltype(
     uninitialized_allocate_unique<T>(device_allocator<T>{})
-  )
+  ))
 {
-  // FIXME: This is crude - we construct an unnecessary T on the host for 
+#if !defined(THRUST_DOXYGEN) // This causes Doxygen to choke for some reason.
+  // FIXME: This is crude - we construct an unnecessary T on the host for
   // `device_new`. We need a proper dispatched `construct` algorithm to
   // do this properly.
   auto p = uninitialized_allocate_unique<T>(device_allocator<T>{});
   device_new<T>(p.get(), T(THRUST_FWD(args)...));
   return p;
+#endif
 }
 
 ///////////////////////////////////////////////////////////////////////////////
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #endif // THRUST_CPP_DIALECT >= 2011
diff --git a/thrust/device_malloc.h b/thrust/device_malloc.h
index 75194491e..790ddbac3 100644
--- a/thrust/device_malloc.h
+++ b/thrust/device_malloc.h
@@ -14,9 +14,8 @@
  *  limitations under the License.
  */
 
-
-/*! \file device_malloc.h
- *  \brief Allocates storage in device memory
+/*! \file
+ *  \brief Allocates storage in device memory.
  */
 
 #pragma once
@@ -25,11 +24,9 @@
 #include <thrust/device_ptr.h>
 #include <cstddef> // for std::size_t
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
-/*! \addtogroup allocation_functions Allocation Functions
- *  \ingroup memory_management_functions
+/*! \addtogroup memory_management Memory Management
  *  \{
  */
 
@@ -94,10 +91,10 @@ inline thrust::device_ptr<void> device_malloc(const std::size_t n);
 template<typename T>
   inline thrust::device_ptr<T> device_malloc(const std::size_t n);
 
-/*! \}
+/*! \} // memory_management
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/device_malloc.inl>
 
diff --git a/thrust/device_malloc_allocator.h b/thrust/device_malloc_allocator.h
index 319564e56..1b15045f2 100644
--- a/thrust/device_malloc_allocator.h
+++ b/thrust/device_malloc_allocator.h
@@ -14,9 +14,8 @@
  *  limitations under the License.
  */
 
-
-/*! \file device_malloc_allocator.h
- *  \brief An allocator which allocates storage with \p device_malloc
+/*! \file 
+ *  \brief An allocator which allocates storage with \p device_malloc.
  */
 
 #pragma once
@@ -29,15 +28,13 @@
 #include <limits>
 #include <stdexcept>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 // forward declarations to WAR circular #includes
 template<typename> class device_ptr;
 template<typename T> device_ptr<T> device_malloc(const std::size_t n);
 
-/*! \addtogroup memory_management Memory Management
- *  \addtogroup memory_management_classes Memory Management Classes
+/*! \addtogroup allocators Allocators 
  *  \ingroup memory_management
  *  \{
  */
@@ -51,7 +48,7 @@ template<typename T> device_ptr<T> device_malloc(const std::size_t n);
  *  \see device_malloc
  *  \see device_ptr
  *  \see device_allocator
- *  \see http://www.sgi.com/tech/stl/Allocators.html
+ *  \see https://en.cppreference.com/w/cpp/memory/allocator
  */
 template<typename T>
   class device_malloc_allocator
@@ -108,12 +105,16 @@ template<typename T>
     __host__ __device__
     inline device_malloc_allocator(device_malloc_allocator<U> const&) {}
 
+#if THRUST_CPP_DIALECT >= 2011
+    device_malloc_allocator & operator=(const device_malloc_allocator &) = default;
+#endif
+
     /*! Returns the address of an allocated object.
      *  \return <tt>&r</tt>.
      */
     __host__ __device__
     inline pointer address(reference r) { return &r; }
-    
+
     /*! Returns the address an allocated object.
      *  \return <tt>&r</tt>.
      */
@@ -173,9 +174,7 @@ template<typename T>
     inline bool operator!=(device_malloc_allocator const &a) const {return !operator==(a); }
 }; // end device_malloc_allocator
 
-/*! \}
+/*! \} // allocators
  */
 
-} // end thrust
-
-
+THRUST_NAMESPACE_END
diff --git a/thrust/device_new.h b/thrust/device_new.h
index 1ae4ce5a4..c615e58f2 100644
--- a/thrust/device_new.h
+++ b/thrust/device_new.h
@@ -27,11 +27,10 @@
 #include <cstddef>
 #include <thrust/device_ptr.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*!
- *  \addtogroup allocation_functions Allocation Functions
+ *  \addtogroup memory_management Memory Management
  *  \{
  */
 
@@ -79,10 +78,9 @@ template <typename T>
 template <typename T>
   device_ptr<T> device_new(const size_t n = 1);
 
-/*! \}
+/*! \} // memory_management
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/device_new.inl>
-
diff --git a/thrust/device_new_allocator.h b/thrust/device_new_allocator.h
index 6182306fb..c9c6b0e95 100644
--- a/thrust/device_new_allocator.h
+++ b/thrust/device_new_allocator.h
@@ -14,9 +14,8 @@
  *  limitations under the License.
  */
 
-
-/*! \file device_new_allocator.h
- *  \brief An allocator which allocates storage with \p device_new
+/*! \file 
+ *  \brief An allocator which allocates storage with \p device_new.
  */
 
 #pragma once
@@ -26,13 +25,15 @@
 #include <thrust/device_reference.h>
 #include <thrust/device_new.h>
 #include <thrust/device_delete.h>
-#include <limits>
+
+#include <cuda/std/cstdint>
+#include <cuda/std/limits>
+
 #include <stdexcept>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
-/*! \addtogroup memory_management_classes Memory Management Classes
+/*! \addtogroup allocators Allocators
  *  \ingroup memory_management
  *  \{
  */
@@ -42,7 +43,7 @@ namespace thrust
  *
  *  \see device_new
  *  \see device_ptr
- *  \see http://www.sgi.com/tech/stl/Allocators.html
+ *  \see https://en.cppreference.com/w/cpp/memory/allocator
  */
 template<typename T>
   class device_new_allocator
@@ -63,8 +64,8 @@ template<typename T>
     /*! \c const reference to allocated element, \c device_reference<const T>. */
     typedef device_reference<const T>         const_reference;
 
-    /*! Type of allocation size, \c std::size_t. */
-    typedef std::size_t                       size_type;
+    /*! Type of allocation size, \c ::cuda::std::size_t. */
+    typedef ::cuda::std::size_t                 size_type;
 
     /*! Type of allocation difference, \c pointer::difference_type. */
     typedef typename pointer::difference_type difference_type;
@@ -139,6 +140,7 @@ template<typename T>
     inline void deallocate(pointer p, size_type cnt)
     {
       // use "::operator delete" rather than keyword delete
+      (void)cnt;
       device_delete(p);
     } // end deallocate()
 
@@ -148,7 +150,7 @@ template<typename T>
     __host__ __device__
     inline size_type max_size() const
     {
-      return std::numeric_limits<size_type>::max THRUST_PREVENT_MACRO_SUBSTITUTION () / sizeof(T);
+      return ::cuda::std::numeric_limits<size_type>::max THRUST_PREVENT_MACRO_SUBSTITUTION () / sizeof(T);
     } // end max_size()
 
     /*! Compares against another \p device_malloc_allocator for equality.
@@ -164,8 +166,7 @@ template<typename T>
     inline bool operator!=(device_new_allocator const &a) {return !operator==(a); }
 }; // end device_new_allocator
 
-/*! \}
+/*! \} // allocators
  */
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/device_ptr.h b/thrust/device_ptr.h
index fb3ad1ee0..5ef4aa464 100644
--- a/thrust/device_ptr.h
+++ b/thrust/device_ptr.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,9 +14,9 @@
  *  limitations under the License.
  */
 
-
-/*! \file device_ptr.h
- *  \brief A pointer to a variable which resides in the "device" system's memory space
+/*! \file
+ *  \brief A pointer to an object which resides in memory associated with the
+ *  \c device system.
  */
 
 #pragma once
@@ -24,169 +24,188 @@
 #include <thrust/detail/config.h>
 #include <thrust/memory.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup memory_management Memory Management
- *  \addtogroup memory_management_classes Memory Management Classes
- *  \ingroup memory_management
  *  \{
  */
 
-// forward declarations
-template<typename T> class device_reference;
+template <typename T> class device_reference;
 
-/*! \p device_ptr stores a pointer to an object allocated in device memory. This type
- *  provides type safety when dispatching standard algorithms on ranges resident in
- *  device memory.
+/*! \brief \c device_ptr is a pointer-like object which points to an object that
+ *  resides in memory associated with the \ref device system.
  *
- *  \p device_ptr has pointer semantics: it may be dereferenced safely from the host and
- *  may be manipulated with pointer arithmetic.
+ *  \c device_ptr has pointer semantics: it may be dereferenced safely from
+ *  anywhere, including the \ref host, and may be manipulated with pointer
+ *  arithmetic.
  *
- *  \p device_ptr can be created with the functions device_malloc, device_new, or
- *  device_pointer_cast, or by explicitly calling its constructor with a raw pointer.
+ *  \c device_ptr can be created with \ref device_new, \ref device_malloc,
+ *  \ref device_malloc_allocator, \ref device_allocator, or
+ *  \ref device_pointer_cast, or by explicitly calling its constructor with a
+ *  raw pointer.
  *
- *  The raw pointer encapsulated by a \p device_ptr may be obtained by either its <tt>get</tt>
- *  method or the \p raw_pointer_cast free function.
+ *  The raw pointer contained in a \c device_ptr may be obtained via \c get
+ *  member function or the \ref raw_pointer_cast free function.
  *
- *  \note \p device_ptr is not a smart pointer; it is the programmer's responsibility to
- *  deallocate memory pointed to by \p device_ptr.
+ *  \ref algorithms operating on \c device_ptr types will automatically be
+ *  dispatched to the \ref device system.
+ *
+ *  \note \c device_ptr is not a smart pointer; it is the programmer's
+ *  responsibility to deallocate memory pointed to by \c device_ptr.
  *
- *  \see device_malloc
  *  \see device_new
+ *  \see device_malloc
+ *  \see device_malloc_allocator
+ *  \see device_allocator
  *  \see device_pointer_cast
  *  \see raw_pointer_cast
  */
-template<typename T>
-  class device_ptr
-    : public thrust::pointer<
-               T,
-               thrust::device_system_tag,
-               thrust::device_reference<T>,
-               thrust::device_ptr<T>
-             >
+template <typename T>
+class device_ptr
+  : public thrust::pointer<
+      T,
+      thrust::device_system_tag,
+      thrust::device_reference<T>,
+      thrust::device_ptr<T>
+    >
 {
   private:
-    typedef thrust::pointer<
+    using super_t = thrust::pointer<
       T,
       thrust::device_system_tag,
       thrust::device_reference<T>,
       thrust::device_ptr<T>
-    > super_t;
+    >;
 
   public:
-    /*! \p device_ptr's null constructor initializes its raw pointer to \c 0.
+    /*! \brief Construct a null \c device_ptr.
+     *
+     *  \post <tt>get() == nullptr</tt>.
      */
     __host__ __device__
     device_ptr() : super_t() {}
 
-    #if THRUST_CPP_DIALECT >= 2011
-    // NOTE: This is needed so that Thrust smart pointers can be used in
-    // `std::unique_ptr`.
+    /*! \brief Construct a null \c device_ptr.
+     *
+     *  \param ptr A null pointer.
+     *
+     *  \post <tt>get() == nullptr</tt>.
+     */
     __host__ __device__
-    device_ptr(decltype(nullptr)) : super_t(nullptr) {}
-    #endif
-
-    /*! \p device_ptr's copy constructor is templated to allow copying to a
-     *  <tt>device_ptr<const T></tt> from a <tt>T *</tt>.
-     *  
-     *  \param ptr A raw pointer to copy from, presumed to point to a location in
-     *         device memory.
+    device_ptr(std::nullptr_t) : super_t(nullptr) {}
+
+    /*! \brief Construct a \c device_ptr from a raw pointer which is
+     *  convertible to \c T*.
+     *
+     *  \tparam U   A type whose pointer is convertible to \c T*.
+     *  \param  ptr A raw pointer to a \c U in device memory to construct from.
+     *
+     *  \pre <tt>std::is_convertible_v<U*, T*> == true</tt>.
+     *
+     *  \pre \c ptr points to a location in device memory.
+     *
+     *  \post <tt>get() == nullptr</tt>.
      */
-    template<typename OtherT>
+    template <typename U>
     __host__ __device__
-    explicit device_ptr(OtherT *ptr) : super_t(ptr) {}
-
-    /*! \p device_ptr's copy constructor allows copying from another device_ptr with related type.
-     *  \param other The \p device_ptr to copy from.
+    explicit device_ptr(U* ptr) : super_t(ptr) {}
+
+    /*! \brief Copy construct a \c device_ptr from another \c device_ptr whose
+     *  pointer type is convertible to \c T*.
+     *
+     *  \tparam U     A type whose pointer is convertible to \c T*.
+     *  \param  other A \c device_ptr to a \c U to construct from.
+     *
+     *  \pre <tt>std::is_convertible_v<U*, T*> == true</tt>.
+     *
+     *  \post <tt>get() == other.get()</tt>.
      */
-    template<typename OtherT>
+    template <typename U>
     __host__ __device__
-    device_ptr(const device_ptr<OtherT> &other) : super_t(other) {}
-
-    /*! \p device_ptr's assignment operator allows assigning from another \p device_ptr with related type.
-     *  \param other The other \p device_ptr to copy from.
-     *  \return <tt>*this</tt>
+    device_ptr(device_ptr<U> const& other) : super_t(other) {}
+
+    /*! \brief Set this \c device_ptr to point to the same object as another
+     *  \c device_ptr whose pointer type is convertible to \c T*.
+     *
+     *  \tparam U     A type whose pointer is convertible to \c T*.
+     *  \param  other A \c device_ptr to a \c U to assign from.
+     *
+     *  \pre <tt>std::is_convertible_v<U*, T*> == true</tt>.
+     *
+     *  \post <tt>get() == other.get()</tt>.
+     *
+     *  \return \c *this.
      */
-    template<typename OtherT>
+    template <typename U>
     __host__ __device__
-    device_ptr &operator=(const device_ptr<OtherT> &other)
+    device_ptr &operator=(device_ptr<U> const& other)
     {
       super_t::operator=(other);
       return *this;
     }
 
-    #if THRUST_CPP_DIALECT >= 2011
-    // NOTE: This is needed so that Thrust smart pointers can be used in
-    // `std::unique_ptr`.
+    /*! \brief Set this \c device_ptr to null.
+     *
+     *  \param ptr A null pointer.
+     *
+     *  \post <tt>get() == nullptr</tt>.
+     *
+     *  \return \c *this.
+     */
     __host__ __device__
-    device_ptr& operator=(decltype(nullptr))
+    device_ptr& operator=(std::nullptr_t)
     {
       super_t::operator=(nullptr);
       return *this;
     }
-    #endif
 
-// declare these members for the purpose of Doxygenating them
-// they actually exist in a derived-from class
-#if 0
-    /*! This method returns this \p device_ptr's raw pointer.
-     *  \return This \p device_ptr's raw pointer.
+#if THRUST_DOXYGEN
+    /*! \brief Return the raw pointer that this \c device_ptr points to.
      */
     __host__ __device__
-    T *get(void) const;
-#endif // end doxygen-only members
-}; // end device_ptr
-
-// declare these methods for the purpose of Doxygenating them
-// they actually are defined for a derived-from class
-#if 0
-/*! Writes to an output stream the value of a \p device_ptr's raw pointer.
+    T* get() const;
+#endif
+};
+
+#if THRUST_DOXYGEN
+/*! Write the address that a \c device_ptr points to to an output stream.
  *
  *  \param os The output stream.
- *  \param p The \p device_ptr to output.
- *  \return os.
+ *  \param dp The \c device_ptr to output.
+ *
+ *  \return \c os.
  */
-template<typename T, typename charT, typename traits>
-std::basic_ostream<charT, traits> &
-operator<<(std::basic_ostream<charT, traits> &os, const device_ptr<T> &p);
+template <typename T, typename CharT, typename Traits>
+__host__ std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, device_ptr<T> const& dp);
 #endif
 
-/*! \}
- */
-
-
-/*!
- *  \addtogroup memory_management_functions Memory Management Functions
- *  \ingroup memory_management
- *  \{
- */
-
-/*! \p device_pointer_cast creates a device_ptr from a raw pointer which is presumed to point
- *  to a location in device memory.
+/*! \brief Create a \c device_ptr from a raw pointer.
+ *
+ *  \tparam T   Any type.
+ *  \param  ptr A raw pointer to a \c T in device memory.
  *
- *  \param ptr A raw pointer, presumed to point to a location in device memory.
- *  \return A device_ptr wrapping ptr.
+ *  \pre \c ptr points to a location in device memory.
+ *
+ *  \return A \c device_ptr<T> pointing to \c ptr.
  */
-template<typename T>
+template <typename T>
 __host__ __device__
-inline device_ptr<T> device_pointer_cast(T *ptr);
+device_ptr<T> device_pointer_cast(T* ptr);
 
-/*! This version of \p device_pointer_cast creates a copy of a device_ptr from another device_ptr.
- *  This version is included for symmetry with \p raw_pointer_cast.
+/*! \brief Create a \c device_ptr from another \c device_ptr.
  *
- *  \param ptr A device_ptr.
- *  \return A copy of \p ptr.
+ *  \tparam T    Any type.
+ *  \param  dptr A \c device_ptr to a \c T.
  */
 template<typename T>
 __host__ __device__
-inline device_ptr<T> device_pointer_cast(const device_ptr<T> &ptr);
+device_ptr<T> device_pointer_cast(device_ptr<T> const& dptr);
 
-/*! \}
+/*! \} // memory_management
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/device_ptr.inl>
 #include <thrust/detail/raw_pointer_cast.h>
-
diff --git a/thrust/device_reference.h b/thrust/device_reference.h
index 6d8538b2f..512ab4c60 100644
--- a/thrust/device_reference.h
+++ b/thrust/device_reference.h
@@ -14,9 +14,9 @@
  *  limitations under the License.
  */
 
-
-/*! \file device_reference.h
- *  \brief A reference to a variable which resides in the "device" system's memory space
+/*! \file 
+ *  \brief A reference to an object which resides in memory associated with the
+ *  device system.
  */
 
 #pragma once
@@ -26,11 +26,9 @@
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/reference.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
-/*! \addtogroup memory_management_classes Memory Management Classes
- *  \ingroup memory_management
+/*! \addtogroup memory_management Memory Management
  *  \{
  */
 
@@ -38,7 +36,7 @@ namespace thrust
  *  \p device_reference is not intended to be used directly; rather, this type
  *  is the result of deferencing a \p device_ptr. Similarly, taking the address of
  *  a \p device_reference yields a \p device_ptr.
- *  
+ *
  *  \p device_reference may often be used from host code in place of operations defined on
  *  its associated \c value_type. For example, when \p device_reference refers to an
  *  arithmetic type, arithmetic operations on it are legal:
@@ -158,7 +156,7 @@ namespace thrust
  *    return 0;
  *  }
  *  \endcode
- *  
+ *
  *  Another common case where a \p device_reference cannot directly be used in place of
  *  its referent object occurs when passing them as parameters to functions like \c printf
  *  which have varargs parameters. Because varargs parameters must be Plain Old Data, a
@@ -209,7 +207,7 @@ template<typename T>
     /*! This copy constructor accepts a const reference to another
      *  \p device_reference. After this \p device_reference is constructed,
      *  it shall refer to the same object as \p other.
-     *  
+     *
      *  \param other A \p device_reference to copy from.
      *
      *  The following code snippet demonstrates the semantics of this
@@ -233,7 +231,7 @@ template<typename T>
      *  assert(ref == 13);
      *  \endcode
      *
-     *  \note This constructor is templated primarily to allow initialization of 
+     *  \note This constructor is templated primarily to allow initialization of
      *  <tt>device_reference<const T></tt> from <tt>device_reference<T></tt>.
      */
     template<typename OtherT>
@@ -289,16 +287,22 @@ template<typename T>
      */
     template<typename OtherT>
     __host__ __device__
-    device_reference &operator=(const device_reference<OtherT> &other);
+    device_reference &operator=(const device_reference<OtherT> &other)
+    {
+      return super_t::operator=(other);
+    }
 
     /*! Assignment operator assigns the value of the given value to the
      *  value referenced by this \p device_reference.
-     *  
+     *
      *  \param x The value to assign from.
      *  \return <tt>*this</tt>
      */
     __host__ __device__
-    device_reference &operator=(const value_type &x);
+    device_reference &operator=(const value_type &x)
+    {
+      return super_t::operator=(x);
+    }
 
 // declare these members for the purpose of Doxygenating them
 // they actually exist in a derived-from class
@@ -332,7 +336,7 @@ template<typename T>
      *  \p device_reference.
      *
      *  \return <tt>*this</tt>
-     *  
+     *
      *  The following code snippet demonstrates the semantics of
      *  \p device_reference's prefix increment operator.
      *
@@ -467,7 +471,7 @@ template<typename T>
      *  \p device_reference.
      *
      *  \return <tt>*this</tt>
-     *  
+     *
      *  The following code snippet demonstrates the semantics of
      *  \p device_reference's prefix decrement operator.
      *
@@ -958,11 +962,14 @@ template<typename T>
  */
 template<typename T>
 __host__ __device__
-void swap(device_reference<T> x, device_reference<T> y);
+void swap(device_reference<T>& x, device_reference<T>& y)
+{
+  x.swap(y);
+}
 
 // declare these methods for the purpose of Doxygenating them
 // they actually are defined for a derived-from class
-#if 0
+#if THRUST_DOXYGEN
 /*! Writes to an output stream the value of a \p device_reference.
  *
  *  \param os The output stream.
@@ -974,10 +981,7 @@ std::basic_ostream<charT, traits> &
 operator<<(std::basic_ostream<charT, traits> &os, const device_reference<T> &y);
 #endif
 
-/*! \}
+/*! \} // memory_management
  */
 
-} // end thrust
-
-#include <thrust/detail/device_reference.inl>
-
+THRUST_NAMESPACE_END
diff --git a/thrust/device_vector.h b/thrust/device_vector.h
index 42d59bd9c..9b97e8d70 100644
--- a/thrust/device_vector.h
+++ b/thrust/device_vector.h
@@ -15,8 +15,9 @@
  */
 
 
-/*! \file device_vector.h
- *  \brief A dynamically-sizable array of elements which reside in the "device" memory space
+/*! \file
+ *  \brief A dynamically-sizable array of elements which resides in memory
+ *         accessible to devices.
  */
 
 #pragma once
@@ -25,18 +26,13 @@
 #include <thrust/detail/vector_base.h>
 #include <thrust/device_allocator.h>
 
+#include <initializer_list>
 #include <vector>
 #include <utility>
 
-namespace thrust
-{
-
-// forward declaration of host_vector
-template<typename T, typename Alloc> class host_vector;
+THRUST_NAMESPACE_BEGIN
 
-/*! \addtogroup container_classes Container Classes
- *  \addtogroup device_containers Device Containers
- *  \ingroup container_classes
+/*! \addtogroup containers Containers
  *  \{
  */
 
@@ -44,12 +40,13 @@ template<typename T, typename Alloc> class host_vector;
  *  constant time removal of elements at the end, and linear time insertion
  *  and removal of elements at the beginning or in the middle. The number of
  *  elements in a \p device_vector may vary dynamically; memory management is
- *  automatic. The memory associated with a \p device_vector resides in the memory
- *  space of a parallel device.
+ *  automatic. The memory associated with a \p device_vector resides in the
+ *  memory accessible to devices.
  *
- *  \see http://www.sgi.com/tech/stl/Vector.html
+ *  \see https://en.cppreference.com/w/cpp/container/vector
  *  \see device_allocator
  *  \see host_vector
+ *  \see universal_vector
  */
 template<typename T, typename Alloc = thrust::device_allocator<T> >
   class device_vector
@@ -68,14 +65,12 @@ template<typename T, typename Alloc = thrust::device_allocator<T> >
 
     /*! This constructor creates an empty \p device_vector.
      */
-    __host__
     device_vector(void)
       :Parent() {}
 
     /*! This constructor creates an empty \p device_vector.
      *  \param alloc The allocator to use by this device_vector.
      */
-    __host__
     device_vector(const Alloc &alloc)
       :Parent(alloc) {}
 
@@ -83,14 +78,12 @@ template<typename T, typename Alloc = thrust::device_allocator<T> >
      */
     //  Define an empty destructor to explicitly specify
     //  its execution space qualifier, as a workaround for nvcc warning
-    __host__
     ~device_vector(void) {}
 
     /*! This constructor creates a \p device_vector with the given
      *  size.
      *  \param n The number of elements to initially create.
      */
-    __host__
     explicit device_vector(size_type n)
       :Parent(n) {}
 
@@ -99,7 +92,6 @@ template<typename T, typename Alloc = thrust::device_allocator<T> >
      *  \param n The number of elements to initially create.
      *  \param alloc The allocator to use by this device_vector.
      */
-    __host__
     explicit device_vector(size_type n, const Alloc &alloc)
       :Parent(n,alloc) {}
 
@@ -108,7 +100,6 @@ template<typename T, typename Alloc = thrust::device_allocator<T> >
      *  \param n The number of elements to initially create.
      *  \param value An element to copy.
      */
-    __host__
     explicit device_vector(size_type n, const value_type &value)
       :Parent(n,value) {}
 
@@ -118,14 +109,12 @@ template<typename T, typename Alloc = thrust::device_allocator<T> >
      *  \param value An element to copy.
      *  \param alloc The allocator to use by this device_vector.
      */
-    __host__
     explicit device_vector(size_type n, const value_type &value, const Alloc &alloc)
       :Parent(n,value,alloc) {}
 
     /*! Copy constructor copies from an exemplar \p device_vector.
      *  \param v The \p device_vector to copy.
      */
-    __host__
     device_vector(const device_vector &v)
       :Parent(v) {}
 
@@ -133,7 +122,6 @@ template<typename T, typename Alloc = thrust::device_allocator<T> >
      *  \param v The \p device_vector to copy.
      *  \param alloc The allocator to use by this device_vector.
      */
-    __host__
     device_vector(const device_vector &v, const Alloc &alloc)
       :Parent(v,alloc) {}
 
@@ -141,7 +129,6 @@ template<typename T, typename Alloc = thrust::device_allocator<T> >
     /*! Move constructor moves from another \p device_vector.
      *  \param v The device_vector to move.
      */
-    __host__
     device_vector(device_vector &&v)
       :Parent(std::move(v)) {}
 
@@ -149,7 +136,6 @@ template<typename T, typename Alloc = thrust::device_allocator<T> >
      *  \param v The device_vector to move.
      *  \param alloc The allocator to use by this device_vector.
      */
-    __host__
     device_vector(device_vector &&v, const Alloc &alloc)
       :Parent(std::move(v), alloc) {}
   #endif // THRUST_CPP_DIALECT >= 2011
@@ -157,7 +143,6 @@ template<typename T, typename Alloc = thrust::device_allocator<T> >
     /*! Copy assign operator copies another \p device_vector with the same type.
      *  \param v The \p device_vector to copy.
      */
-    __host__
     device_vector &operator=(const device_vector &v)
     { Parent::operator=(v); return *this; }
 
@@ -165,7 +150,6 @@ template<typename T, typename Alloc = thrust::device_allocator<T> >
     /*! Move assign operator moves from another \p device_vector.
      *  \param v The device_vector to move.
      */
-     __host__
      device_vector &operator=(device_vector &&v)
      { Parent::operator=(std::move(v)); return *this; }
   #endif // THRUST_CPP_DIALECT >= 2011
@@ -174,16 +158,13 @@ template<typename T, typename Alloc = thrust::device_allocator<T> >
      *  \param v The \p device_vector to copy.
      */
     template<typename OtherT, typename OtherAlloc>
-    __host__ explicit
-    __device__
-    device_vector(const device_vector<OtherT,OtherAlloc> &v)
+    explicit device_vector(const device_vector<OtherT,OtherAlloc> &v)
       :Parent(v) {}
 
     /*! Assign operator copies from an exemplar \p device_vector with different type.
      *  \param v The \p device_vector to copy.
      */
     template<typename OtherT, typename OtherAlloc>
-    __host__
     device_vector &operator=(const device_vector<OtherT,OtherAlloc> &v)
     { Parent::operator=(v); return *this; }
 
@@ -191,7 +172,6 @@ template<typename T, typename Alloc = thrust::device_allocator<T> >
      *  \param v The <tt>std::vector</tt> to copy.
      */
     template<typename OtherT, typename OtherAlloc>
-    __host__
     device_vector(const std::vector<OtherT,OtherAlloc> &v)
       :Parent(v) {}
 
@@ -199,31 +179,49 @@ template<typename T, typename Alloc = thrust::device_allocator<T> >
      *  \param v The <tt>std::vector</tt> to copy.
      */
     template<typename OtherT, typename OtherAlloc>
-    __host__
     device_vector &operator=(const std::vector<OtherT,OtherAlloc> &v)
     { Parent::operator=(v); return *this;}
 
-    /*! Copy constructor copies from an exemplar \p host_vector with possibly different type.
-     *  \param v The \p host_vector to copy.
+    /*! Copy construct from a \p vector_base whose element type is convertible
+     *  to \c T.
+     *
+     *  \param v The \p vector_base to copy.
      */
     template<typename OtherT, typename OtherAlloc>
-    __host__
-    device_vector(const host_vector<OtherT,OtherAlloc> &v);
+    device_vector(const detail::vector_base<OtherT,OtherAlloc> &v)
+      :Parent(v) {}
 
-    /*! Assign operator copies from an examplar \p host_vector.
-     *  \param v The \p host_vector to copy.
+    /*! Assign a \p vector_base whose element type is convertible to \c T.
+     *  \param v The \p vector_base to copy.
      */
     template<typename OtherT, typename OtherAlloc>
-    __host__
-    device_vector &operator=(const host_vector<OtherT,OtherAlloc> &v)
+    device_vector &operator=(const detail::vector_base<OtherT,OtherAlloc> &v)
     { Parent::operator=(v); return *this; }
 
+    /*! This constructor builds a \p device_vector from an intializer_list.
+     *  \param il The intializer_list.
+     */
+    device_vector(std::initializer_list<T> il)
+      :Parent(il) {}
+      
+    /*! This constructor builds a \p device_vector from an intializer_list.
+     *  \param il The intializer_list.
+     *  \param alloc The allocator to use by this device_vector.
+     */
+    device_vector(std::initializer_list<T> il, const Alloc &alloc)
+      :Parent(il, alloc) {}
+      
+    /*! Assign an \p intializer_list with a matching element type
+     *  \param il The intializer_list.
+     */
+    device_vector &operator=(std::initializer_list<T> il)
+    { Parent::operator=(il); return *this; }
+
     /*! This constructor builds a \p device_vector from a range.
      *  \param first The beginning of the range.
      *  \param last The end of the range.
      */
     template<typename InputIterator>
-    __host__
     device_vector(InputIterator first, InputIterator last)
       :Parent(first,last) {}
 
@@ -233,7 +231,6 @@ template<typename T, typename Alloc = thrust::device_allocator<T> >
      *  \param alloc The allocator to use by this device_vector.
      */
     template<typename InputIterator>
-    __host__
     device_vector(InputIterator first, InputIterator last, const Alloc &alloc)
       :Parent(first,last,alloc) {}
 
@@ -453,7 +450,7 @@ template<typename T, typename Alloc = thrust::device_allocator<T> >
      *  \param x The exemplar element to copy & insert.
      *  \return An iterator pointing to the newly inserted element.
      */
-    iterator insert(iterator position, const T &x); 
+    iterator insert(iterator position, const T &x);
 
     /*! This method inserts a copy of an exemplar value to a range at the
      *  specified position in this vector.
@@ -469,8 +466,8 @@ template<typename T, typename Alloc = thrust::device_allocator<T> >
      *  \param first The beginning of the range to copy.
      *  \param last  The end of the range to copy.
      *
-     *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html>Input Iterator</a>,
-     *                        and \p InputIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>.
+     *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator>Input Iterator</a>,
+     *                        and \p InputIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>.
      */
     template<typename InputIterator>
     void insert(iterator position, InputIterator first, InputIterator last);
@@ -486,7 +483,7 @@ template<typename T, typename Alloc = thrust::device_allocator<T> >
      *  \param first The beginning of the range to copy.
      *  \param last  The end of the range to copy.
      *
-     *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator">Input Iterator</a>.
+     *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/named_req/InputIterator">Input Iterator</a>.
      */
     template<typename InputIterator>
     void assign(InputIterator first, InputIterator last);
@@ -496,7 +493,7 @@ template<typename T, typename Alloc = thrust::device_allocator<T> >
      */
     allocator_type get_allocator(void) const;
 #endif // end doxygen-only members
-}; // end device_vector
+};
 
 /*! Exchanges the values of two vectors.
  *  \p x The first \p device_vector of interest.
@@ -506,13 +503,9 @@ template<typename T, typename Alloc>
   void swap(device_vector<T,Alloc> &a, device_vector<T,Alloc> &b)
 {
   a.swap(b);
-} // end swap()
+}
 
-/*! \}
+/*! \} // containres
  */
 
-} // end thrust
-
-#include <thrust/detail/device_vector.inl>
-
-
+THRUST_NAMESPACE_END
diff --git a/thrust/distance.h b/thrust/distance.h
index 6dd4800be..890879115 100644
--- a/thrust/distance.h
+++ b/thrust/distance.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \{
@@ -40,7 +38,7 @@ namespace thrust
  *  \param last The end of an input range of interest.
  *  \return The distance between the beginning and end of the input range.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *
  *  \pre If \c InputIterator meets the requirements of random access iterator, \p last shall be reachable from \p first or
  *       \p first shall be reachable from \p last; otherwise, \p last shall be reachable from \p first.
@@ -61,7 +59,7 @@ namespace thrust
  *  // d is 7
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/distance.html
+ *  \see https://en.cppreference.com/w/cpp/iterator/distance
  */
 template<typename InputIterator>
 inline __host__ __device__
@@ -71,7 +69,6 @@ inline __host__ __device__
 /*! \} // end iterators
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/distance.inl>
-
diff --git a/thrust/equal.h b/thrust/equal.h
index bc6db5015..2f3518907 100644
--- a/thrust/equal.h
+++ b/thrust/equal.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup reductions
  *  \{
@@ -52,11 +50,11 @@ namespace thrust
  *  \return \c true, if the sequences are equal; \c false, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *          and \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *          and \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>,
  *          and \p InputIterator1's \c value_type can be compared for equality with \c InputIterator2's \c value_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *          and \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *          and \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>,
  *          and \p InputIterator2's \c value_type can be compared for equality with \c InputIterator1's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p equal to test
@@ -74,7 +72,7 @@ namespace thrust
  *  // result == false
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/equal.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/equal
  */
 template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2>
 __host__ __device__
@@ -93,11 +91,11 @@ bool equal(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, Inp
  *  \param first2 The beginning of the second sequence.
  *  \return \c true, if the sequences are equal; \c false, otherwise.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *          and \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *          and \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>,
  *          and \p InputIterator1's \c value_type can be compared for equality with \c InputIterator2's \c value_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *          and \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *          and \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>,
  *          and \p InputIterator2's \c value_type can be compared for equality with \c InputIterator1's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p equal to test
@@ -114,7 +112,7 @@ bool equal(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, Inp
  *  // result == false
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/equal.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/equal
  */
 template <typename InputIterator1, typename InputIterator2>
 bool equal(InputIterator1 first1, InputIterator1 last1,
@@ -139,11 +137,11 @@ bool equal(InputIterator1 first1, InputIterator1 last1,
  *  \return \c true, if the sequences are equal; \c false, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator1's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator2's \c value_type is convertible to \p BinaryPredicate's \c second_argument_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p equal to compare the
  *  elements in two ranges modulo 2 using the \p thrust::host execution policy.
@@ -170,7 +168,7 @@ bool equal(InputIterator1 first1, InputIterator1 last1,
  *  // result is false
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/equal.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/equal
  */
 template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename BinaryPredicate>
 __host__ __device__
@@ -191,11 +189,11 @@ bool equal(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, Inp
  *  \param binary_pred Binary predicate used to test element equality.
  *  \return \c true, if the sequences are equal; \c false, otherwise.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator1's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator2's \c value_type is convertible to \p BinaryPredicate's \c second_argument_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p equal to compare the
  *  elements in two ranges modulo 2.
@@ -220,7 +218,7 @@ bool equal(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, Inp
  *  // result is true
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/equal.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/equal
  */
 template <typename InputIterator1, typename InputIterator2, 
           typename BinaryPredicate>
@@ -232,7 +230,6 @@ bool equal(InputIterator1 first1, InputIterator1 last1,
  *  \} // end reductions
  */
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/equal.inl>
-
diff --git a/thrust/execution_policy.h b/thrust/execution_policy.h
index ef1a5d853..ecf14413f 100644
--- a/thrust/execution_policy.h
+++ b/thrust/execution_policy.h
@@ -39,9 +39,7 @@
 
 //! \endcond
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \cond
  */
@@ -284,10 +282,9 @@ template<typename DerivedPolicy>
  *    }
  *  };
  *  ...
- *  int vec(3);
- *  vec[0] = 0; vec[1] = 1; vec[2] = 2;
+ *  int vec[] = { 0, 1, 2 };
  *
- *  thrust::for_each(thrust::host, vec.begin(), vec.end(), printf_functor());
+ *  thrust::for_each(thrust::host, vec, vec + 3, printf_functor());
  *
  *  // 0 1 2 is printed to standard output in some unspecified order
  *  \endcode
@@ -344,11 +341,7 @@ static const detail::host_t host;
  *  \see host_execution_policy
  *  \see thrust::device
  */
-#ifdef __CUDA_ARCH__
-static const __device__ detail::device_t device;
-#else
-static const detail::device_t device;
-#endif
+THRUST_INLINE_CONSTANT detail::device_t device;
 
 
 // define seq for the purpose of Doxygenating it
@@ -396,5 +389,4 @@ static const detail::seq_t seq;
  */
 
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/extrema.h b/thrust/extrema.h
index c9fd016cc..ca419a0aa 100644
--- a/thrust/extrema.h
+++ b/thrust/extrema.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! This version of \p min returns the smaller of two values, given a comparison operation.
  *  \param lhs The first value to compare.
@@ -35,7 +33,7 @@ namespace thrust
  *  \return The smaller element.
  *
  *  \tparam T is convertible to \p BinaryPredicate's first argument type and to its second argument type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">BinaryPredicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">BinaryPredicate</a>.
  *
  *  The following code snippet demonstrates how to use \p min to compute the smaller of two
  *  key-value objects.
@@ -80,7 +78,7 @@ __host__ __device__
  *  \param rhs The second value to compare.
  *  \return The smaller element.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  The following code snippet demonstrates how to use \p min to compute the smaller of two
  *  integers.
@@ -111,7 +109,7 @@ __host__ __device__
  *  \return The larger element.
  *
  *  \tparam T is convertible to \p BinaryPredicate's first argument type and to its second argument type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">BinaryPredicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">BinaryPredicate</a>.
  *
  *  The following code snippet demonstrates how to use \p max to compute the larger of two
  *  key-value objects.
@@ -156,7 +154,7 @@ __host__ __device__
  *  \param rhs The second value to compare.
  *  \return The larger element.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  The following code snippet demonstrates how to use \p max to compute the larger of two
  *  integers.
@@ -207,9 +205,9 @@ __host__ __device__
  *  \return An iterator pointing to the smallest element of the range <tt>[first, last)</tt>,
  *          if it is not an empty range; \p last, otherwise.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \c ForwardIterator's \c value_type is a model of
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  \code
  *  #include <thrust/extrema.h>
@@ -222,7 +220,7 @@ __host__ __device__
  *  // *result is 0
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/min_element.html 
+ *  \see https://en.cppreference.com/w/cpp/algorithm/min_element 
  */
 template<typename DerivedPolicy, typename ForwardIterator>
 __host__ __device__
@@ -246,9 +244,9 @@ ForwardIterator min_element(const thrust::detail::execution_policy_base<DerivedP
  *  \return An iterator pointing to the smallest element of the range <tt>[first, last)</tt>,
  *          if it is not an empty range; \p last, otherwise.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \c ForwardIterator's \c value_type is a model of
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  \code
  *  #include <thrust/extrema.h>
@@ -260,7 +258,7 @@ ForwardIterator min_element(const thrust::detail::execution_policy_base<DerivedP
  *  // *result is 0
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/min_element.html 
+ *  \see https://en.cppreference.com/w/cpp/algorithm/min_element 
  */
 template <typename ForwardIterator>
 ForwardIterator min_element(ForwardIterator first, ForwardIterator last);
@@ -288,10 +286,10 @@ ForwardIterator min_element(ForwardIterator first, ForwardIterator last);
  *          if it is not an empty range; \p last, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to both \p comp's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p min_element to find the smallest element
  *  of a collection of key-value pairs using the \p thrust::host execution policy for parallelization:
@@ -325,7 +323,7 @@ ForwardIterator min_element(ForwardIterator first, ForwardIterator last);
  *  // *smallest == {0,7}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/min_element.html 
+ *  \see https://en.cppreference.com/w/cpp/algorithm/min_element 
  */
 template<typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
 __host__ __device__
@@ -350,10 +348,10 @@ ForwardIterator min_element(const thrust::detail::execution_policy_base<DerivedP
  *  \return An iterator pointing to the smallest element of the range <tt>[first, last)</tt>,
  *          if it is not an empty range; \p last, otherwise.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to both \p comp's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p min_element to find the smallest element
  *  of a collection of key-value pairs.
@@ -385,7 +383,7 @@ ForwardIterator min_element(const thrust::detail::execution_policy_base<DerivedP
  *  // *smallest == {0,7}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/min_element.html 
+ *  \see https://en.cppreference.com/w/cpp/algorithm/min_element 
  */
 template <typename ForwardIterator, typename BinaryPredicate>
 ForwardIterator min_element(ForwardIterator first, ForwardIterator last,
@@ -413,9 +411,9 @@ ForwardIterator min_element(ForwardIterator first, ForwardIterator last,
  *          if it is not an empty range; \p last, otherwise.
  *
  *  \tparam A Thrust backend system.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \c ForwardIterator's \c value_type is a model of
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  \code
  *  #include <thrust/extrema.h>
@@ -427,7 +425,7 @@ ForwardIterator min_element(ForwardIterator first, ForwardIterator last,
  *  // *result == 3
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/max_element.html 
+ *  \see https://en.cppreference.com/w/cpp/algorithm/max_element 
  */
 template<typename DerivedPolicy, typename ForwardIterator>
 __host__ __device__
@@ -451,9 +449,9 @@ ForwardIterator max_element(const thrust::detail::execution_policy_base<DerivedP
  *  \return An iterator pointing to the largest element of the range <tt>[first, last)</tt>,
  *          if it is not an empty range; \p last, otherwise.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \c ForwardIterator's \c value_type is a model of
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  \code
  *  #include <thrust/extrema.h>
@@ -464,7 +462,7 @@ ForwardIterator max_element(const thrust::detail::execution_policy_base<DerivedP
  *  // *result == 3
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/max_element.html 
+ *  \see https://en.cppreference.com/w/cpp/algorithm/max_element 
  */
 template <typename ForwardIterator>
 ForwardIterator max_element(ForwardIterator first, ForwardIterator last);
@@ -492,10 +490,10 @@ ForwardIterator max_element(ForwardIterator first, ForwardIterator last);
  *          if it is not an empty range; \p last, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to both \p comp's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p max_element to find the largest element
  *  of a collection of key-value pairs using the \p thrust::host execution policy for parallelization.
@@ -529,7 +527,7 @@ ForwardIterator max_element(ForwardIterator first, ForwardIterator last);
  *  // *largest == {6,1}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/max_element.html 
+ *  \see https://en.cppreference.com/w/cpp/algorithm/max_element 
  */
 template<typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
 __host__ __device__
@@ -554,10 +552,10 @@ ForwardIterator max_element(const thrust::detail::execution_policy_base<DerivedP
  *  \return An iterator pointing to the largest element of the range <tt>[first, last)</tt>,
  *          if it is not an empty range; \p last, otherwise.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to both \p comp's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p max_element to find the largest element
  *  of a collection of key-value pairs.
@@ -589,7 +587,7 @@ ForwardIterator max_element(const thrust::detail::execution_policy_base<DerivedP
  *  // *largest == {6,1}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/max_element.html 
+ *  \see https://en.cppreference.com/w/cpp/algorithm/max_element 
  */
 template <typename ForwardIterator, typename BinaryPredicate>
 ForwardIterator max_element(ForwardIterator first, ForwardIterator last,
@@ -610,9 +608,9 @@ ForwardIterator max_element(ForwardIterator first, ForwardIterator last,
  *          if it is not an empty range; \p last, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \c ForwardIterator's \c value_type is a model of
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  \code
  *  #include <thrust/extrema.h>
@@ -646,9 +644,9 @@ thrust::pair<ForwardIterator,ForwardIterator> minmax_element(const thrust::detai
  *  \return A pair of iterator pointing to the smallest and largest elements of the range <tt>[first, last)</tt>,
  *          if it is not an empty range; \p last, otherwise.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \c ForwardIterator's \c value_type is a model of
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  \code
  *  #include <thrust/extrema.h>
@@ -686,10 +684,10 @@ thrust::pair<ForwardIterator,ForwardIterator> minmax_element(ForwardIterator fir
  *          if it is not an empty range; \p last, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to both \p comp's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p minmax_element to find the smallest and largest elements
  *  of a collection of key-value pairs using the \p thrust::host execution policy for parallelization:
@@ -746,10 +744,10 @@ thrust::pair<ForwardIterator,ForwardIterator> minmax_element(const thrust::detai
  *  \return A pair of iterator pointing to the smallest and largest elements of the range <tt>[first, last)</tt>,
  *          if it is not an empty range; \p last, otherwise.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to both \p comp's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p minmax_element to find the smallest and largest elements
  *  of a collection of key-value pairs.
@@ -797,8 +795,7 @@ thrust::pair<ForwardIterator,ForwardIterator> minmax_element(ForwardIterator fir
  *  \} // end reductions
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/extrema.inl>
 #include <thrust/detail/minmax.h>
-
diff --git a/thrust/fill.h b/thrust/fill.h
index 850313802..bd9e40268 100644
--- a/thrust/fill.h
+++ b/thrust/fill.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup transformations
  *  \addtogroup filling
@@ -48,9 +46,9 @@ namespace thrust
  *  \param value The value to be copied.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T's \c value_type is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p fill to set a thrust::device_vector's
@@ -67,7 +65,7 @@ namespace thrust
  *  // v[0] == 137, v[1] == 137, v[2] == 137, v[3] == 137
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/fill.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/fill
  *  \see \c fill_n
  *  \see \c uninitialized_fill
  */
@@ -88,9 +86,9 @@ __host__ __device__
  *  \param last The end of the sequence.
  *  \param value The value to be copied.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T's \c value_type is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p fill to set a thrust::device_vector's
@@ -106,7 +104,7 @@ __host__ __device__
  *  // v[0] == 137, v[1] == 137, v[2] == 137, v[3] == 137
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/fill.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/fill
  *  \see \c fill_n
  *  \see \c uninitialized_fill
  */
@@ -131,8 +129,8 @@ __host__ __device__
  *  \return <tt>first + n</tt>
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T's \c value_type is convertible to a type in \p OutputIterator's set of \c value_type.
  *
  *  The following code snippet demonstrates how to use \p fill to set a thrust::device_vector's
@@ -149,7 +147,7 @@ __host__ __device__
  *  // v[0] == 137, v[1] == 137, v[2] == 137, v[3] == 137
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/fill_n.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/fill_n
  *  \see \c fill
  *  \see \c uninitialized_fill_n
  */
@@ -171,8 +169,8 @@ __host__ __device__
  *  \param value The value to be copied.
  *  \return <tt>first + n</tt>
  *
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T's \c value_type is convertible to a type in \p OutputIterator's set of \c value_type.
  *
  *  The following code snippet demonstrates how to use \p fill to set a thrust::device_vector's
@@ -188,7 +186,7 @@ __host__ __device__
  *  // v[0] == 137, v[1] == 137, v[2] == 137, v[3] == 137
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/fill_n.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/fill_n
  *  \see \c fill
  *  \see \c uninitialized_fill_n
  */
@@ -203,7 +201,6 @@ __host__ __device__
  *  \} // transformations
  */
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/fill.inl>
-
diff --git a/thrust/find.h b/thrust/find.h
index 6e992499e..5ab9b0a2d 100644
--- a/thrust/find.h
+++ b/thrust/find.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup algorithms
  */
@@ -50,9 +48,9 @@ namespace thrust
  *  \return The first iterator \c i such that <tt>*i == value</tt> or \c last.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *          and \p InputIterator's \c value_type is equality comparable to type \c T.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">EqualityComparable</a>. 
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">EqualityComparable</a>. 
  *
  *  \code
  *  #include <thrust/find.h>
@@ -93,9 +91,9 @@ InputIterator find(const thrust::detail::execution_policy_base<DerivedPolicy> &e
  *  \param value The value to find.
  *  \return The first iterator \c i such that <tt>*i == value</tt> or \c last.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *          and \p InputIterator's \c value_type is equality comparable to type \c T.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">EqualityComparable</a>. 
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">EqualityComparable</a>. 
  *
  *  \code
  *  #include <thrust/find.h>
@@ -137,8 +135,8 @@ InputIterator find(InputIterator first,
  *  \return The first iterator \c i such that <tt>pred(*i)</tt> is \c true, or \c last.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \code
  *  #include <thrust/find.h>
@@ -200,8 +198,8 @@ InputIterator find_if(const thrust::detail::execution_policy_base<DerivedPolicy>
  *  \param pred A predicate used to test range elements.
  *  \return The first iterator \c i such that <tt>pred(*i)</tt> is \c true, or \c last.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \code
  *  #include <thrust/find.h>
@@ -263,8 +261,8 @@ InputIterator find_if(InputIterator first,
  *  \return The first iterator \c i such that <tt>pred(*i)</tt> is \c false, or \c last.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \code
  *  #include <thrust/find.h>
@@ -326,8 +324,8 @@ InputIterator find_if_not(const thrust::detail::execution_policy_base<DerivedPol
  *  \param pred A predicate used to test range elements.
  *  \return The first iterator \c i such that <tt>pred(*i)</tt> is \c false, or \c last.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \code
  *  #include <thrust/find.h>
@@ -378,8 +376,6 @@ InputIterator find_if_not(InputIterator first,
 /*! \} // end searching
  */
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/find.inl>
-
diff --git a/thrust/for_each.h b/thrust/for_each.h
index dcc87f399..7d05e3ea1 100644
--- a/thrust/for_each.h
+++ b/thrust/for_each.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup modifying
  *  \ingroup transformations
@@ -50,13 +48,13 @@ namespace thrust
  *  \return last
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/named_req/InputIterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction">Unary Function</a>,
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>,
  *          and \p UnaryFunction does not apply any non-constant operation through its argument.
  *
  *  The following code snippet demonstrates how to use \p for_each to print the elements
- *  of a \p std::device_vector using the \p thrust::device parallelization policy:
+ *  of a \p thrust::device_vector using the \p thrust::device parallelization policy:
  *
  *  \code
  *  #include <thrust/for_each.h>
@@ -86,7 +84,7 @@ namespace thrust
  *  \endcode
  *
  *  \see for_each_n
- *  \see http://www.sgi.com/tech/stl/for_each.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/for_each
  */
 template<typename DerivedPolicy,
          typename InputIterator,
@@ -113,10 +111,10 @@ InputIterator for_each(const thrust::detail::execution_policy_base<DerivedPolicy
  *  \return <tt>first + n</tt>
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/named_req/InputIterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type.
  *  \tparam Size is an integral type.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction">Unary Function</a>,
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>,
  *          and \p UnaryFunction does not apply any non-constant operation through its argument.
  *
  *  The following code snippet demonstrates how to use \p for_each_n to print the elements
@@ -149,7 +147,7 @@ InputIterator for_each(const thrust::detail::execution_policy_base<DerivedPolicy
  *  \endcode
  *
  *  \see for_each
- *  \see http://www.sgi.com/tech/stl/for_each.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/for_each
  */
 template<typename DerivedPolicy,
          typename InputIterator,
@@ -173,9 +171,9 @@ InputIterator for_each_n(const thrust::detail::execution_policy_base<DerivedPoli
  *  \param f The function object to apply to the range <tt>[first, last)</tt>.
  *  \return last
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/named_req/InputIterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction">Unary Function</a>,
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>,
  *          and \p UnaryFunction does not apply any non-constant operation through its argument.
  *
  *  The following code snippet demonstrates how to use \p for_each to print the elements
@@ -207,7 +205,7 @@ InputIterator for_each_n(const thrust::detail::execution_policy_base<DerivedPoli
  *  \endcode
  *
  *  \see for_each_n
- *  \see http://www.sgi.com/tech/stl/for_each.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/for_each
  */
 template<typename InputIterator,
          typename UnaryFunction>
@@ -227,10 +225,10 @@ InputIterator for_each(InputIterator first,
  *  \param f The function object to apply to the range <tt>[first, first + n)</tt>.
  *  \return <tt>first + n</tt>
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/named_req/InputIterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type.
  *  \tparam Size is an integral type.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction">Unary Function</a>,
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>,
  *          and \p UnaryFunction does not apply any non-constant operation through its argument.
  *
  *  The following code snippet demonstrates how to use \p for_each_n to print the elements
@@ -262,7 +260,7 @@ InputIterator for_each(InputIterator first,
  *  \endcode
  *
  *  \see for_each
- *  \see http://www.sgi.com/tech/stl/for_each.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/for_each
  */
 template<typename InputIterator,
          typename Size,
@@ -274,7 +272,7 @@ InputIterator for_each_n(InputIterator first,
 /*! \} // end modifying
  */
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/for_each.inl>
 
diff --git a/thrust/functional.h b/thrust/functional.h
index ec8c62104..0608f4b3d 100644
--- a/thrust/functional.h
+++ b/thrust/functional.h
@@ -25,8 +25,7 @@
 #include <functional>
 #include <thrust/detail/functional/placeholder.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup function_objects Function Objects
  */
@@ -47,7 +46,7 @@ template<typename Operation> struct binary_traits;
  *  Unary Function must define nested \c typedefs. Those \c typedefs are
  *  provided by the base class \p unary_function.
  *
- *  The following code snippet demonstrates how to construct an 
+ *  The following code snippet demonstrates how to construct an
  *  Adaptable Unary Function using \p unary_function.
  *
  *  \code
@@ -62,7 +61,7 @@ template<typename Operation> struct binary_traits;
  *        \c unary_function obsolete, its use is optional if C++11 language
  *        features are enabled.
  *
- *  \see http://www.sgi.com/tech/stl/unary_function.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/unary_function
  *  \see binary_function
  */
 template<typename Argument,
@@ -87,7 +86,7 @@ struct unary_function
  *  Binary Function must define nested \c typedefs. Those \c typedefs are
  *  provided by the base class \p binary_function.
  *
- *  The following code snippet demonstrates how to construct an 
+ *  The following code snippet demonstrates how to construct an
  *  Adaptable Binary Function using \p binary_function.
  *
  *  \code
@@ -102,7 +101,7 @@ struct unary_function
  *        \c binary_function obsolete, its use is optional if C++11 language
  *        features are enabled.
  *
- *  \see http://www.sgi.com/tech/stl/binary_function.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/binary_function
  *  \see unary_function
  */
 template<typename Argument1,
@@ -139,11 +138,46 @@ struct binary_function
  *  \{
  */
 
+#define THRUST_UNARY_FUNCTOR_VOID_SPECIALIZATION(func, impl)                   \
+  template <>                                                                  \
+  struct func<void>                                                            \
+  {                                                                            \
+    using is_transparent = void;                                               \
+    __thrust_exec_check_disable__                                              \
+    template <typename T>                                                      \
+    __host__ __device__                                                        \
+    constexpr auto operator()(T&& x) const                                     \
+      noexcept(noexcept(impl)) THRUST_TRAILING_RETURN(decltype(impl))          \
+    {                                                                          \
+      return impl;                                                             \
+    }                                                                          \
+  }
+
+#define THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION(func, impl)                  \
+  template <>                                                                  \
+  struct func<void>                                                            \
+  {                                                                            \
+    using is_transparent = void;                                               \
+    __thrust_exec_check_disable__                                              \
+    template <typename T1, typename T2>                                        \
+    __host__ __device__                                                        \
+    constexpr auto operator()(T1&& t1, T2&& t2) const                          \
+      noexcept(noexcept(impl)) THRUST_TRAILING_RETURN(decltype(impl))          \
+    {                                                                          \
+      return impl;                                                             \
+    }                                                                          \
+  }
+
+#define THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(func, op)                 \
+  THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION(                                   \
+    func, THRUST_FWD(t1) op THRUST_FWD(t2))
+
+
 /*! \p plus is a function object. Specifically, it is an Adaptable Binary Function.
  *  If \c f is an object of class <tt>plus<T></tt>, and \c x and \c y are objects
  *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x+y</tt>.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x and \c y are objects of type \p T, then <tt>x+y</tt> must be defined and must have a return type that is convertible to \c T.
  *
  *  The following code snippet demonstrates how to use <tt>plus</tt> to sum two
@@ -169,10 +203,10 @@ struct binary_function
  *  // V3 is now {76, 77, 78, ..., 1075}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/plus.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/plus
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct plus
 {
   /*! \typedef first_argument_type
@@ -193,14 +227,20 @@ struct plus
   /*! Function call operator. The return value is <tt>lhs + rhs</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs + rhs;}
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs + rhs;
+  }
 }; // end plus
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(plus, +);
+
 /*! \p minus is a function object. Specifically, it is an Adaptable Binary Function.
  *  If \c f is an object of class <tt>minus<T></tt>, and \c x and \c y are objects
  *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x-y</tt>.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x and \c y are objects of type \p T, then <tt>x-y</tt> must be defined and must have a return type that is convertible to \c T.
  *
  *  The following code snippet demonstrates how to use <tt>minus</tt> to subtract
@@ -226,10 +266,10 @@ struct plus
  *  // V3 is now {-74, -73, -72, ..., 925}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/minus.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/minus
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct minus
 {
   /*! \typedef first_argument_type
@@ -250,14 +290,20 @@ struct minus
   /*! Function call operator. The return value is <tt>lhs - rhs</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs - rhs;}
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs - rhs;
+  }
 }; // end minus
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(minus, -);
+
 /*! \p multiplies is a function object. Specifically, it is an Adaptable Binary Function.
  *  If \c f is an object of class <tt>multiplies<T></tt>, and \c x and \c y are objects
  *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x*y</tt>.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x and \c y are objects of type \p T, then <tt>x*y</tt> must be defined and must have a return type that is convertible to \c T.
  *
  *  The following code snippet demonstrates how to use <tt>multiplies</tt> to multiply
@@ -283,10 +329,10 @@ struct minus
  *  // V3 is now {75, 150, 225, ..., 75000}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/multiplies.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/multiplies
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct multiplies
 {
   /*! \typedef first_argument_type
@@ -307,14 +353,20 @@ struct multiplies
   /*! Function call operator. The return value is <tt>lhs * rhs</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs * rhs;}
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs * rhs;
+  }
 }; // end multiplies
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(multiplies, *);
+
 /*! \p divides is a function object. Specifically, it is an Adaptable Binary Function.
  *  If \c f is an object of class <tt>divides<T></tt>, and \c x and \c y are objects
  *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x/y</tt>.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x and \c y are objects of type \p T, then <tt>x/y</tt> must be defined and must have a return type that is convertible to \c T.
  *
  *  The following code snippet demonstrates how to use <tt>divides</tt> to divide
@@ -340,10 +392,10 @@ struct multiplies
  *  // V3 is now {1/75, 2/75, 3/75, ..., 1000/75}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/divides.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/divides
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct divides
 {
   /*! \typedef first_argument_type
@@ -364,14 +416,20 @@ struct divides
   /*! Function call operator. The return value is <tt>lhs / rhs</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs / rhs;}
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs / rhs;
+  }
 }; // end divides
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(divides, /);
+
 /*! \p modulus is a function object. Specifically, it is an Adaptable Binary Function.
  *  If \c f is an object of class <tt>modulus<T></tt>, and \c x and \c y are objects
  *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x \% y</tt>.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x and \c y are objects of type \p T, then <tt>x \% y</tt> must be defined and must have a return type that is convertible to \c T.
  *
  *  The following code snippet demonstrates how to use <tt>modulus</tt> to take
@@ -397,10 +455,10 @@ struct divides
  *  // V3 is now {1%75, 2%75, 3%75, ..., 1000%75}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/modulus.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/modulus
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct modulus
 {
   /*! \typedef first_argument_type
@@ -421,14 +479,20 @@ struct modulus
   /*! Function call operator. The return value is <tt>lhs % rhs</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs % rhs;}
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs % rhs;
+  }
 }; // end modulus
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(modulus, %);
+
 /*! \p negate is a function object. Specifically, it is an Adaptable Unary Function.
  *  If \c f is an object of class <tt>negate<T></tt>, and \c x is an object
  *  of class \c T, then <tt>f(x)</tt> returns <tt>-x</tt>.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x is an object of type \p T, then <tt>-x</tt> must be defined and must have a return type that is convertible to \c T.
  *
  *  The following code snippet demonstrates how to use <tt>negate</tt> to negate
@@ -451,10 +515,10 @@ struct modulus
  *  // V2 is now {-1, -2, -3, ..., -1000}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/negate.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/negate
  *  \see unary_function
  */
-template<typename T>
+template<typename T = void>
 struct negate
 {
   /*! \typedef argument_type
@@ -470,14 +534,20 @@ struct negate
   /*! Function call operator. The return value is <tt>-x</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ T operator()(const T &x) const {return -x;}
+  __host__ __device__
+  constexpr T operator()(const T &x) const
+  {
+    return -x;
+  }
 }; // end negate
 
+THRUST_UNARY_FUNCTOR_VOID_SPECIALIZATION(negate, -THRUST_FWD(x));
+
 /*! \p square is a function object. Specifically, it is an Adaptable Unary Function.
  *  If \c f is an object of class <tt>square<T></tt>, and \c x is an object
  *  of class \c T, then <tt>f(x)</tt> returns <tt>x*x</tt>.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x is an object of type \p T, then <tt>x*x</tt> must be defined and must have a return type that is convertible to \c T.
  *
  *  The following code snippet demonstrates how to use <tt>square</tt> to square
@@ -502,7 +572,7 @@ struct negate
  *
  *  \see unary_function
  */
-template<typename T>
+template<typename T = void>
 struct square
 {
   /*! \typedef argument_type
@@ -518,9 +588,15 @@ struct square
   /*! Function call operator. The return value is <tt>x*x</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ T operator()(const T &x) const {return x*x;}
+  __host__ __device__
+  constexpr T operator()(const T &x) const
+  {
+    return x*x;
+  }
 }; // end square
 
+THRUST_UNARY_FUNCTOR_VOID_SPECIALIZATION(square, x*x);
+
 /*! \}
  */
 
@@ -535,12 +611,12 @@ struct square
  *  and \c y are objects of class \c T, then <tt>f(x,y)</tt> returns \c true if
  *  <tt>x == y</tt> and \c false otherwise.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
  *
- *  \see http://www.sgi.com/tech/stl/equal_to.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/equal_to
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct equal_to
 {
   /*! \typedef first_argument_type
@@ -561,21 +637,27 @@ struct equal_to
   /*! Function call operator. The return value is <tt>lhs == rhs</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs == rhs;}
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs == rhs;
+  }
 }; // end equal_to
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(equal_to, ==);
+
 /*! \p not_equal_to is a function object. Specifically, it is an Adaptable Binary
  *  Predicate, which means it is a function object that tests the truth or falsehood
  *  of some condition. If \c f is an object of class <tt>not_equal_to<T></tt> and \c x
  *  and \c y are objects of class \c T, then <tt>f(x,y)</tt> returns \c true if
  *  <tt>x != y</tt> and \c false otherwise.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
  *
- *  \see http://www.sgi.com/tech/stl/not_equal_to.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/not_equal_to
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct not_equal_to
 {
   /*! \typedef first_argument_type
@@ -596,21 +678,27 @@ struct not_equal_to
   /*! Function call operator. The return value is <tt>lhs != rhs</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs != rhs;}
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs != rhs;
+  }
 }; // end not_equal_to
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(not_equal_to, !=);
+
 /*! \p greater is a function object. Specifically, it is an Adaptable Binary
  *  Predicate, which means it is a function object that tests the truth or falsehood
  *  of some condition. If \c f is an object of class <tt>greater<T></tt> and \c x
  *  and \c y are objects of class \c T, then <tt>f(x,y)</tt> returns \c true if
  *  <tt>x > y</tt> and \c false otherwise.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
- *  \see http://www.sgi.com/tech/stl/greater.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/greater
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct greater
 {
   /*! \typedef first_argument_type
@@ -631,21 +719,27 @@ struct greater
   /*! Function call operator. The return value is <tt>lhs > rhs</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs > rhs;}
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs > rhs;
+  }
 }; // end greater
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(greater, >);
+
 /*! \p less is a function object. Specifically, it is an Adaptable Binary
  *  Predicate, which means it is a function object that tests the truth or falsehood
  *  of some condition. If \c f is an object of class <tt>less<T></tt> and \c x
  *  and \c y are objects of class \c T, then <tt>f(x,y)</tt> returns \c true if
  *  <tt>x < y</tt> and \c false otherwise.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
- *  \see http://www.sgi.com/tech/stl/less.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/less
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct less
 {
   /*! \typedef first_argument_type
@@ -666,21 +760,27 @@ struct less
   /*! Function call operator. The return value is <tt>lhs < rhs</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs < rhs;}
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs < rhs;
+  }
 }; // end less
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(less, <);
+
 /*! \p greater_equal is a function object. Specifically, it is an Adaptable Binary
  *  Predicate, which means it is a function object that tests the truth or falsehood
  *  of some condition. If \c f is an object of class <tt>greater_equal<T></tt> and \c x
  *  and \c y are objects of class \c T, then <tt>f(x,y)</tt> returns \c true if
  *  <tt>x >= y</tt> and \c false otherwise.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
- *  \see http://www.sgi.com/tech/stl/greater_equal.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/greater_equal
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct greater_equal
 {
   /*! \typedef first_argument_type
@@ -701,21 +801,27 @@ struct greater_equal
   /*! Function call operator. The return value is <tt>lhs >= rhs</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs >= rhs;}
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs >= rhs;
+  }
 }; // end greater_equal
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(greater_equal, >=);
+
 /*! \p less_equal is a function object. Specifically, it is an Adaptable Binary
  *  Predicate, which means it is a function object that tests the truth or falsehood
  *  of some condition. If \c f is an object of class <tt>less_equal<T></tt> and \c x
  *  and \c y are objects of class \c T, then <tt>f(x,y)</tt> returns \c true if
  *  <tt>x <= y</tt> and \c false otherwise.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
- *  \see http://www.sgi.com/tech/stl/less_equal.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/less_equal
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct less_equal
 {
   /*! \typedef first_argument_type
@@ -736,9 +842,15 @@ struct less_equal
   /*! Function call operator. The return value is <tt>lhs <= rhs</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs <= rhs;}
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs <= rhs;
+  }
 }; // end less_equal
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(less_equal, <=);
+
 /*! \}
  */
 
@@ -756,10 +868,10 @@ struct less_equal
  *
  *  \tparam T must be convertible to \c bool.
  *
- *  \see http://www.sgi.com/tech/stl/logical_and.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/logical_and
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct logical_and
 {
   /*! \typedef first_argument_type
@@ -780,9 +892,15 @@ struct logical_and
   /*! Function call operator. The return value is <tt>lhs && rhs</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs && rhs;}
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs && rhs;
+  }
 }; // end logical_and
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(logical_and, &&);
+
 /*! \p logical_or is a function object. Specifically, it is an Adaptable Binary Predicate,
  *  which means it is a function object that tests the truth or falsehood of some condition.
  *  If \c f is an object of class <tt>logical_or<T></tt> and \c x and \c y are objects of
@@ -791,10 +909,10 @@ struct logical_and
  *
  *  \tparam T must be convertible to \c bool.
  *
- *  \see http://www.sgi.com/tech/stl/logical_or.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/logical_or
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct logical_or
 {
   /*! \typedef first_argument_type
@@ -815,9 +933,15 @@ struct logical_or
   /*! Function call operator. The return value is <tt>lhs || rhs</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs || rhs;}
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs || rhs;
+  }
 }; // end logical_or
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(logical_or, ||);
+
 /*! \p logical_not is a function object. Specifically, it is an Adaptable Predicate,
  *  which means it is a function object that tests the truth or falsehood of some condition.
  *  If \c f is an object of class <tt>logical_not<T></tt> and \c x is an object of
@@ -840,10 +964,10 @@ struct logical_or
  *  // The elements of V are now the logical complement of what they were prior
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/logical_not.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/logical_not
  *  \see unary_function
  */
-template<typename T>
+template<typename T = void>
 struct logical_not
 {
   /*! \typedef first_argument_type
@@ -864,9 +988,15 @@ struct logical_not
   /*! Function call operator. The return value is <tt>!x</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ bool operator()(const T &x) const {return !x;}
+  __host__ __device__
+  constexpr bool operator()(const T &x) const
+  {
+    return !x;
+  }
 }; // end logical_not
 
+THRUST_UNARY_FUNCTOR_VOID_SPECIALIZATION(logical_not, !THRUST_FWD(x));
+
 /*! \}
  */
 
@@ -879,7 +1009,7 @@ struct logical_not
  *  If \c f is an object of class <tt>bit_and<T></tt>, and \c x and \c y are objects
  *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x&y</tt>.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x and \c y are objects of type \p T, then <tt>x&y</tt> must be defined and must have a return type that is convertible to \c T.
  *
  *  The following code snippet demonstrates how to use <tt>bit_and</tt> to take
@@ -907,7 +1037,7 @@ struct logical_not
  *
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct bit_and
 {
   /*! \typedef first_argument_type
@@ -928,14 +1058,20 @@ struct bit_and
   /*! Function call operator. The return value is <tt>lhs & rhs</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs & rhs;}
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs & rhs;
+  }
 }; // end bit_and
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(bit_and, &);
+
 /*! \p bit_or is a function object. Specifically, it is an Adaptable Binary Function.
  *  If \c f is an object of class <tt>bit_and<T></tt>, and \c x and \c y are objects
  *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x|y</tt>.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x and \c y are objects of type \p T, then <tt>x|y</tt> must be defined and must have a return type that is convertible to \c T.
  *
  *  The following code snippet demonstrates how to use <tt>bit_or</tt> to take
@@ -963,7 +1099,7 @@ struct bit_and
  *
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct bit_or
 {
   /*! \typedef first_argument_type
@@ -984,14 +1120,20 @@ struct bit_or
   /*! Function call operator. The return value is <tt>lhs | rhs</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs | rhs;}
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs | rhs;
+  }
 }; // end bit_or
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(bit_or, |);
+
 /*! \p bit_xor is a function object. Specifically, it is an Adaptable Binary Function.
  *  If \c f is an object of class <tt>bit_and<T></tt>, and \c x and \c y are objects
  *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x^y</tt>.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x and \c y are objects of type \p T, then <tt>x^y</tt> must be defined and must have a return type that is convertible to \c T.
  *
  *  The following code snippet demonstrates how to use <tt>bit_xor</tt> to take
@@ -1019,7 +1161,7 @@ struct bit_or
  *
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct bit_xor
 {
   /*! \typedef first_argument_type
@@ -1040,9 +1182,15 @@ struct bit_xor
   /*! Function call operator. The return value is <tt>lhs ^ rhs</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs ^ rhs;}
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs ^ rhs;
+  }
 }; // end bit_xor
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(bit_xor, ^);
+
 /*! \}
  */
 
@@ -1068,10 +1216,10 @@ struct bit_xor
  *  assert(x == id(x));
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/identity.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/identity
  *  \see unary_function
  */
-template<typename T>
+template<typename T = void>
 struct identity
 {
   /*! \typedef argument_type
@@ -1087,15 +1235,21 @@ struct identity
   /*! Function call operator. The return value is <tt>x</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ const T &operator()(const T &x) const {return x;}
+  __host__ __device__
+  constexpr const T &operator()(const T &x) const
+  {
+    return x;
+  }
 }; // end identity
 
+THRUST_UNARY_FUNCTOR_VOID_SPECIALIZATION(identity, THRUST_FWD(x));
+
 /*! \p maximum is a function object that takes two arguments and returns the greater
  *  of the two. Specifically, it is an Adaptable Binary Function. If \c f is an
  *  object of class <tt>maximum<T></tt> and \c x and \c y are objects of class \c T
  *  <tt>f(x,y)</tt> returns \c x if <tt>x > y</tt> and \c y, otherwise.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  The following code snippet demonstrates that \p maximum returns its
  *  greater argument.
@@ -1114,7 +1268,7 @@ struct identity
  *  \see min
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct maximum
 {
   /*! \typedef first_argument_type
@@ -1135,15 +1289,23 @@ struct maximum
   /*! Function call operator. The return value is <tt>rhs < lhs ? lhs : rhs</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs < rhs ? rhs : lhs;}
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs < rhs ? rhs : lhs;
+  }
 }; // end maximum
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION(maximum,
+                                          t1 < t2 ? THRUST_FWD(t2)
+                                                  : THRUST_FWD(t1));
+
 /*! \p minimum is a function object that takes two arguments and returns the lesser
  *  of the two. Specifically, it is an Adaptable Binary Function. If \c f is an
  *  object of class <tt>minimum<T></tt> and \c x and \c y are objects of class \c T
  *  <tt>f(x,y)</tt> returns \c x if <tt>x < y</tt> and \c y, otherwise.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  The following code snippet demonstrates that \p minimum returns its
  *  lesser argument.
@@ -1162,7 +1324,7 @@ struct maximum
  *  \see max
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct minimum
 {
   /*! \typedef first_argument_type
@@ -1183,10 +1345,18 @@ struct minimum
   /*! Function call operator. The return value is <tt>lhs < rhs ? lhs : rhs</tt>.
    */
   __thrust_exec_check_disable__
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs < rhs ? lhs : rhs;}
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs < rhs ? lhs : rhs;
+  }
 }; // end minimum
 
-/*! \p project1st is a function object that takes two arguments and returns 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION(minimum,
+                                          t1 < t2 ? THRUST_FWD(t1)
+                                                  : THRUST_FWD(t2));
+
+/*! \p project1st is a function object that takes two arguments and returns
  *  its first argument; the second argument is unused. It is essentially a
  *  generalization of identity to the case of a Binary Function.
  *
@@ -1204,7 +1374,7 @@ struct minimum
  *  \see project2nd
  *  \see binary_function
  */
-template<typename T1, typename T2>
+template<typename T1 = void, typename T2 = void>
 struct project1st
 {
   /*! \typedef first_argument_type
@@ -1224,10 +1394,29 @@ struct project1st
 
   /*! Function call operator. The return value is <tt>lhs</tt>.
    */
-  __host__ __device__ const T1 &operator()(const T1 &lhs, const T2 & /*rhs*/) const {return lhs;}
+  __host__ __device__
+  constexpr const T1 &operator()(const T1 &lhs, const T2 & /*rhs*/) const
+  {
+    return lhs;
+  }
 }; // end project1st
 
-/*! \p project2nd is a function object that takes two arguments and returns 
+template <>
+struct project1st<void, void>
+{
+  using is_transparent = void;
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&&) const
+    noexcept(noexcept(THRUST_FWD(t1)))
+    THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1)))
+  {
+    return THRUST_FWD(t1);
+  }
+};
+
+/*! \p project2nd is a function object that takes two arguments and returns
  *  its second argument; the first argument is unused. It is essentially a
  *  generalization of identity to the case of a Binary Function.
  *
@@ -1245,7 +1434,7 @@ struct project1st
  *  \see project1st
  *  \see binary_function
  */
-template<typename T1, typename T2>
+template<typename T1 = void, typename T2 = void>
 struct project2nd
 {
   /*! \typedef first_argument_type
@@ -1265,13 +1454,31 @@ struct project2nd
 
   /*! Function call operator. The return value is <tt>rhs</tt>.
    */
-  __host__ __device__ const T2 &operator()(const T1 &/*lhs*/, const T2 &rhs) const {return rhs;}
+  __host__ __device__
+  constexpr const T2 &operator()(const T1 &/*lhs*/, const T2 &rhs) const
+  {
+    return rhs;
+  }
 }; // end project2nd
 
+template <>
+struct project2nd<void, void>
+{
+  using is_transparent = void;
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&&, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t2);
+  }
+};
+
 /*! \}
  */
 
-
 // odds and ends
 
 /*! \addtogroup function_object_adaptors
@@ -1286,11 +1493,11 @@ struct project2nd
  *  There is rarely any reason to construct a <tt>unary_negate</tt> directly;
  *  it is almost always easier to use the helper function not1.
  *
- *  \see http://www.sgi.com/tech/stl/unary_negate.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/unary_negate
  *  \see not1
  */
 template<typename Predicate>
-struct unary_negate 
+struct unary_negate
     : public thrust::unary_function<typename Predicate::argument_type, bool>
 {
   /*! Constructor takes a \p Predicate object to negate.
@@ -1323,7 +1530,7 @@ struct unary_negate
  *  \return A new object, <tt>npred</tt> such that <tt>npred(x)</tt> always returns
  *          the same value as <tt>!pred(x)</tt>.
  *
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/AdaptablePredicate.html">Adaptable Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_negate">Adaptable Predicate</a>.
  *
  *  \see unary_negate
  *  \see not2
@@ -1332,7 +1539,7 @@ template<typename Predicate>
   __host__ __device__
   unary_negate<Predicate> not1(const Predicate &pred);
 
-/*! \p binary_negate is a function object adaptor: it is an Adaptable Binary 
+/*! \p binary_negate is a function object adaptor: it is an Adaptable Binary
  *  Predicate that represents the logical negation of some other Adaptable
  *  Binary Predicate. That is: if \c f is an object of class <tt>binary_negate<AdaptablePredicate></tt>,
  *  then there exists an object \c pred of class \c AdaptableBinaryPredicate
@@ -1340,7 +1547,7 @@ template<typename Predicate>
  *  There is rarely any reason to construct a <tt>binary_negate</tt> directly;
  *  it is almost always easier to use the helper function not2.
  *
- *  \see http://www.sgi.com/tech/stl/binary_negate.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/binary_negate
  */
 template<typename Predicate>
 struct binary_negate
@@ -1359,8 +1566,8 @@ struct binary_negate
   __thrust_exec_check_disable__
   __host__ __device__
   bool operator()(const typename Predicate::first_argument_type& x, const typename Predicate::second_argument_type& y)
-  { 
-      return !pred(x,y); 
+  {
+      return !pred(x,y);
   }
 
   /*! \cond
@@ -1381,7 +1588,7 @@ struct binary_negate
  *  \return A new object, <tt>npred</tt> such that <tt>npred(x,y)</tt> always returns
  *          the same value as <tt>!pred(x,y)</tt>.
  *
- *  \tparam Binary Predicate is a model of <a href="http://www.sgi.com/tech/stl/AdaptableBinaryPredicate.html">Adaptable Binary Predicate</a>.
+ *  \tparam Binary Predicate is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/AdaptableBinaryPredicate">Adaptable Binary Predicate</a>.
  *
  *  \see binary_negate
  *  \see not1
@@ -1448,92 +1655,52 @@ namespace placeholders
 
 /*! \p thrust::placeholders::_1 is the placeholder for the first function parameter.
  */
-#ifdef __CUDA_ARCH__
-static const __device__ thrust::detail::functional::placeholder<0>::type _1;
-#else
-static const thrust::detail::functional::placeholder<0>::type _1;
-#endif
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<0>::type _1;
 
 
 /*! \p thrust::placeholders::_2 is the placeholder for the second function parameter.
  */
-#ifdef __CUDA_ARCH__
-static const __device__ thrust::detail::functional::placeholder<1>::type _2;
-#else
-static const thrust::detail::functional::placeholder<1>::type _2;
-#endif
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<1>::type _2;
 
 
 /*! \p thrust::placeholders::_3 is the placeholder for the third function parameter.
  */
-#ifdef __CUDA_ARCH__
-static const __device__ thrust::detail::functional::placeholder<2>::type _3;
-#else
-static const thrust::detail::functional::placeholder<2>::type _3;
-#endif
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<2>::type _3;
 
 
 /*! \p thrust::placeholders::_4 is the placeholder for the fourth function parameter.
  */
-#ifdef __CUDA_ARCH__
-static const __device__ thrust::detail::functional::placeholder<3>::type _4;
-#else
-static const thrust::detail::functional::placeholder<3>::type _4;
-#endif
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<3>::type _4;
 
 
 /*! \p thrust::placeholders::_5 is the placeholder for the fifth function parameter.
  */
-#ifdef __CUDA_ARCH__
-static const __device__ thrust::detail::functional::placeholder<4>::type _5;
-#else
-static const thrust::detail::functional::placeholder<4>::type _5;
-#endif
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<4>::type _5;
 
 
 /*! \p thrust::placeholders::_6 is the placeholder for the sixth function parameter.
  */
-#ifdef __CUDA_ARCH__
-static const __device__ thrust::detail::functional::placeholder<5>::type _6;
-#else
-static const thrust::detail::functional::placeholder<5>::type _6;
-#endif
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<5>::type _6;
 
 
 /*! \p thrust::placeholders::_7 is the placeholder for the seventh function parameter.
  */
-#ifdef __CUDA_ARCH__
-static const __device__ thrust::detail::functional::placeholder<6>::type _7;
-#else
-static const thrust::detail::functional::placeholder<6>::type _7;
-#endif
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<6>::type _7;
 
 
 /*! \p thrust::placeholders::_8 is the placeholder for the eighth function parameter.
  */
-#ifdef __CUDA_ARCH__
-static const __device__ thrust::detail::functional::placeholder<7>::type _8;
-#else
-static const thrust::detail::functional::placeholder<7>::type _8;
-#endif
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<7>::type _8;
 
 
 /*! \p thrust::placeholders::_9 is the placeholder for the ninth function parameter.
  */
-#ifdef __CUDA_ARCH__
-static const __device__ thrust::detail::functional::placeholder<8>::type _9;
-#else
-static const thrust::detail::functional::placeholder<8>::type _9;
-#endif
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<8>::type _9;
 
 
 /*! \p thrust::placeholders::_10 is the placeholder for the tenth function parameter.
  */
-#ifdef __CUDA_ARCH__
-static const __device__ thrust::detail::functional::placeholder<9>::type _10;
-#else
-static const thrust::detail::functional::placeholder<9>::type _10;
-#endif
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<9>::type _10;
 
 
 } // end placeholders
@@ -1542,9 +1709,11 @@ static const thrust::detail::functional::placeholder<9>::type _10;
 /*! \} // placeholder_objects
  */
 
+#undef THRUST_UNARY_FUNCTOR_VOID_SPECIALIZATION
+#undef THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION
+#undef THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/functional.inl>
 #include <thrust/detail/functional/operators.h>
-
diff --git a/thrust/future.h b/thrust/future.h
index 90dcc705d..d8fb7544b 100644
--- a/thrust/future.h
+++ b/thrust/future.h
@@ -21,10 +21,9 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/cpp11_required.h>
-#include <thrust/detail/modern_gcc_required.h>
+#include <thrust/detail/cpp14_required.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <thrust/execution_policy.h>
 #include <thrust/detail/static_assert.h>
@@ -55,7 +54,7 @@
   #include __THRUST_DEVICE_SYSTEM_FUTURE_HEADER
 #undef __THRUST_DEVICE_SYSTEM_FUTURE_HEADER
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 ///////////////////////////////////////////////////////////////////////////////
 
@@ -172,7 +171,6 @@ using thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::when_all;
 
 ///////////////////////////////////////////////////////////////////////////////
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #endif
-
diff --git a/thrust/gather.h b/thrust/gather.h
index 276650a6c..41acc22a3 100644
--- a/thrust/gather.h
+++ b/thrust/gather.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup gathering
  *  \ingroup copying
@@ -48,11 +46,12 @@ namespace thrust
  *  \param result Beginning of the destination range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam InputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The range <tt>[map_first, map_last)</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *  \pre The input data shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
  *
  *  \remark \p gather is the inverse of thrust::scatter.
  *
@@ -103,11 +102,12 @@ __host__ __device__
  *  \param input_first Beginning of the source range.
  *  \param result Beginning of the destination range.
  *
- *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam InputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The range <tt>[map_first, map_last)</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *  \pre The input data shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
  *
  *  \remark \p gather is the inverse of thrust::scatter.
  *
@@ -159,13 +159,14 @@ template<typename InputIterator,
  *  \param result Beginning of the destination range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c bool.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam InputIterator1 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator2 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c bool.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The range <tt>[map_first, map_last)</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
  *  \pre The range <tt>[stencil, stencil + (map_last - map_first))</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *  \pre The input data shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
  *
  *  \remark \p gather_if is the inverse of \p scatter_if.
  *
@@ -225,13 +226,14 @@ __host__ __device__
  *  \param input_first Beginning of the source range.
  *  \param result Beginning of the destination range.
  *
- *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c bool.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam InputIterator1 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator2 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c bool.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The range <tt>[map_first, map_last)</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
  *  \pre The range <tt>[stencil, stencil + (map_last - map_first))</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *  \pre The input data shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
  *
  *  \remark \p gather_if is the inverse of \p scatter_if.
  *
@@ -291,14 +293,15 @@ template<typename InputIterator1,
  *  \param pred Predicate to apply to the stencil values.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c Predicate's \c argument_type.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator1 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator2 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c Predicate's \c argument_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[map_first, map_last)</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
  *  \pre The range <tt>[stencil, stencil + (map_last - map_first))</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *  \pre The input data shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
  *
  *  \remark \p gather_if is the inverse of \p scatter_if.
  *
@@ -371,14 +374,15 @@ __host__ __device__
  *  \param result Beginning of the destination range.
  *  \param pred Predicate to apply to the stencil values.
  *
- *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c Predicate's \c argument_type.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator1 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator2 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c Predicate's \c argument_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[map_first, map_last)</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
  *  \pre The range <tt>[stencil, stencil + (map_last - map_first))</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *  \pre The input data shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
  *
  *  \remark \p gather_if is the inverse of \p scatter_if.
  *
@@ -435,7 +439,7 @@ template<typename InputIterator1,
 /*! \} // gathering
  */
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/gather.inl>
 
diff --git a/thrust/generate.h b/thrust/generate.h
index a651dd0dc..d47295344 100644
--- a/thrust/generate.h
+++ b/thrust/generate.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup transformations
  *  \{
@@ -45,9 +43,9 @@ namespace thrust
  *             elements in the range <tt>[first,last)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam Generator is a model of <a href="http://www.sgi.com/tech/stl/Generator.html">Generator</a>,
+ *  \tparam Generator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional">Generator</a>,
  *          and \p Generator's \c result_type is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to fill a \c host_vector with random numbers,
@@ -67,7 +65,7 @@ namespace thrust
  *  \endcode
  *
  *  \see generate_n
- *  \see http://www.sgi.com/tech/stl/generate.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/generate
  */
 template<typename DerivedPolicy,
          typename ForwardIterator,
@@ -87,9 +85,9 @@ __host__ __device__
  *  \param gen A function argument, taking no parameters, used to generate values to assign to
  *             elements in the range <tt>[first,last)</tt>.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam Generator is a model of <a href="http://www.sgi.com/tech/stl/Generator.html">Generator</a>,
+ *  \tparam Generator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional">Generator</a>,
  *          and \p Generator's \c result_type is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to fill a \c host_vector with random numbers,
@@ -109,7 +107,7 @@ __host__ __device__
  *  \endcode
  *
  *  \see generate_n
- *  \see http://www.sgi.com/tech/stl/generate.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/generate
  */
 template<typename ForwardIterator,
          typename Generator>
@@ -130,9 +128,9 @@ template<typename ForwardIterator,
  *             elements in the range <tt>[first,first + n)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *  \tparam Size is an integral type (either signed or unsigned).
- *  \tparam Generator is a model of <a href="http://www.sgi.com/tech/stl/Generator.html">Generator</a>,
+ *  \tparam Generator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional">Generator</a>,
  *          and \p Generator's \c result_type is convertible to a type in \p OutputIterator's set of \c value_types.
  *
  *  The following code snippet demonstrates how to fill a \c host_vector with random numbers,
@@ -152,7 +150,7 @@ template<typename ForwardIterator,
  *  \endcode
  *
  *  \see generate
- *  \see http://www.sgi.com/tech/stl/generate.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/generate
  */
 template<typename DerivedPolicy,
          typename OutputIterator,
@@ -173,9 +171,9 @@ __host__ __device__
  *  \param gen A function argument, taking no parameters, used to generate values to assign to
  *             elements in the range <tt>[first,first + n)</tt>.
  *
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *  \tparam Size is an integral type (either signed or unsigned).
- *  \tparam Generator is a model of <a href="http://www.sgi.com/tech/stl/Generator.html">Generator</a>,
+ *  \tparam Generator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional">Generator</a>,
  *          and \p Generator's \c result_type is convertible to a type in \p OutputIterator's set of \c value_types.
  *
  *  The following code snippet demonstrates how to fill a \c host_vector with random numbers,
@@ -194,7 +192,7 @@ __host__ __device__
  *  \endcode
  *
  *  \see generate
- *  \see http://www.sgi.com/tech/stl/generate.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/generate
  */
 template<typename OutputIterator,
          typename Size,
@@ -207,7 +205,7 @@ template<typename OutputIterator,
 /*! \} // end transformations
  */
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/generate.inl>
 
diff --git a/thrust/host_vector.h b/thrust/host_vector.h
index 047949089..bb925ea9c 100644
--- a/thrust/host_vector.h
+++ b/thrust/host_vector.h
@@ -16,22 +16,21 @@
 
 
 /*! \file host_vector.h
- *  \brief A dynamically-sizable array of elements which reside in the "host" memory space
+ *  \brief A dynamically-sizable array of elements which resides in memory
+ *         accessible to hosts.
  */
 
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <memory>
+#include <thrust/detail/memory_wrapper.h>
 #include <thrust/detail/vector_base.h>
+
+#include <initializer_list>
 #include <vector>
 #include <utility>
 
-namespace thrust
-{
-
-// forward declaration of device_vector
-template<typename T, typename Alloc> class device_vector;
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup container_classes Container Classes
  *  \addtogroup host_containers Host Containers
@@ -43,11 +42,12 @@ template<typename T, typename Alloc> class device_vector;
  *  constant time removal of elements at the end, and linear time insertion
  *  and removal of elements at the beginning or in the middle. The number of
  *  elements in a \p host_vector may vary dynamically; memory management is
- *  automatic. The memory associated with a \p host_vector resides in the memory
- *  space of the host associated with a parallel device.
+ *  automatic. The memory associated with a \p host_vector resides in memory
+ *  accessible to hosts.
  *
- *  \see http://www.sgi.com/tech/stl/Vector.html
+ *  \see https://en.cppreference.com/w/cpp/container/vector
  *  \see device_vector
+ *  \see universal_vector
  */
 template<typename T, typename Alloc = std::allocator<T> >
   class host_vector
@@ -135,7 +135,7 @@ template<typename T, typename Alloc = std::allocator<T> >
     host_vector(const host_vector &v, const Alloc &alloc)
       :Parent(v,alloc) {}
 
-  #if __cplusplus >= 201103L
+  #if THRUST_CPP_DIALECT >= 2011
     /*! Move constructor moves from another host_vector.
      *  \param v The host_vector to move.
      */
@@ -159,7 +159,7 @@ template<typename T, typename Alloc = std::allocator<T> >
   host_vector &operator=(const host_vector &v)
   { Parent::operator=(v); return *this; }
 
-  #if __cplusplus >= 201103L
+  #if THRUST_CPP_DIALECT >= 2011
     /*! Move assign operator moves from another host_vector.
      *  \param v The host_vector to move.
      */
@@ -200,20 +200,43 @@ template<typename T, typename Alloc = std::allocator<T> >
     host_vector &operator=(const std::vector<OtherT,OtherAlloc> &v)
     { Parent::operator=(v); return *this;}
 
-    /*! Copy constructor copies from an exemplar \p device_vector with possibly different type.
-     *  \param v The \p device_vector to copy.
+    /*! Copy construct from a \p vector_base whose element type is convertible
+     *  to \c T.
+     *
+     *  \param v The \p vector_base to copy.
      */
     template<typename OtherT, typename OtherAlloc>
     __host__
-    host_vector(const device_vector<OtherT,OtherAlloc> &v);
+    host_vector(const detail::vector_base<OtherT,OtherAlloc> &v)
+      :Parent(v) {}
 
-    /*! Assign operator copies from an exemplar \p device_vector.
-     *  \param v The \p device_vector to copy.
+    /*! Assign a \p vector_base whose element type is convertible to \c T.
+     *
+     *  \param v The \p vector_base to copy.
      */
     template<typename OtherT, typename OtherAlloc>
     __host__
-    host_vector &operator=(const device_vector<OtherT,OtherAlloc> &v)
+    host_vector &operator=(const detail::vector_base<OtherT,OtherAlloc> &v)
     { Parent::operator=(v); return *this; }
+    
+    /*! This constructor builds a \p host_vector from an intializer_list.
+     *  \param il The intializer_list.
+     */
+    host_vector(std::initializer_list<T> il)
+      :Parent(il) {}
+      
+    /*! This constructor builds a \p host_vector from an intializer_list.
+     *  \param il The intializer_list.
+     *  \param alloc The allocator to use by this host_vector.
+     */
+    host_vector(std::initializer_list<T> il, const Alloc &alloc)
+      :Parent(il, alloc) {}
+      
+    /*! Assign an \p intializer_list with a matching element type
+     *  \param il The intializer_list.
+     */
+    host_vector &operator=(std::initializer_list<T> il)
+    { Parent::operator=(il); return *this; }
 
     /*! This constructor builds a \p host_vector from a range.
      *  \param first The beginning of the range.
@@ -450,7 +473,7 @@ template<typename T, typename Alloc = std::allocator<T> >
      *  \param x The exemplar element to copy & insert.
      *  \return An iterator pointing to the newly inserted element.
      */
-    iterator insert(iterator position, const T &x); 
+    iterator insert(iterator position, const T &x);
 
     /*! This method inserts a copy of an exemplar value to a range at the
      *  specified position in this vector.
@@ -466,8 +489,8 @@ template<typename T, typename Alloc = std::allocator<T> >
      *  \param first The beginning of the range to copy.
      *  \param last  The end of the range to copy.
      *
-     *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html>Input Iterator</a>,
-     *                        and \p InputIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>.
+     *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator>Input Iterator</a>,
+     *                        and \p InputIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>.
      */
     template<typename InputIterator>
     void insert(iterator position, InputIterator first, InputIterator last);
@@ -483,7 +506,7 @@ template<typename T, typename Alloc = std::allocator<T> >
      *  \param first The beginning of the range to copy.
      *  \param last  The end of the range to copy.
      *
-     *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator">Input Iterator</a>.
+     *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/named_req/InputIterator">Input Iterator</a>.
      */
     template<typename InputIterator>
     void assign(InputIterator first, InputIterator last);
@@ -493,7 +516,7 @@ template<typename T, typename Alloc = std::allocator<T> >
      */
     allocator_type get_allocator(void) const;
 #endif // end doxygen-only members
-}; // end host_vector
+};
 
 /*! Exchanges the values of two vectors.
  *  \p x The first \p host_vector of interest.
@@ -503,12 +526,9 @@ template<typename T, typename Alloc>
   void swap(host_vector<T,Alloc> &a, host_vector<T,Alloc> &b)
 {
   a.swap(b);
-} // end swap()
+}
 
 /*! \}
  */
 
-} // end thrust
-
-#include <thrust/detail/host_vector.inl>
-
+THRUST_NAMESPACE_END
diff --git a/thrust/inner_product.h b/thrust/inner_product.h
index 0206eff38..80068cf0c 100644
--- a/thrust/inner_product.h
+++ b/thrust/inner_product.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup reductions
  *  \{
@@ -53,9 +51,9 @@ namespace thrust
  *          and <tt>[first2, last2)</tt> plus \p init.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputType is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputType is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x is an object of type \p OutputType, and \c y is an object of \p InputIterator1's \c value_type,
  *          and \c z is an object of \p InputIterator2's \c value_type, then <tt>x + y * z</tt> is defined
  *          and is convertible to \p OutputType.
@@ -75,7 +73,7 @@ namespace thrust
  *  // result == 31.0f
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/inner_product.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/inner_product
  */
 template<typename DerivedPolicy,
          typename InputIterator1,
@@ -105,9 +103,9 @@ OutputType inner_product(const thrust::detail::execution_policy_base<DerivedPoli
  *  \return The inner product of sequences <tt>[first1, last1)</tt>
  *          and <tt>[first2, last2)</tt> plus \p init.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputType is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputType is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x is an object of type \p OutputType, and \c y is an object of \p InputIterator1's \c value_type,
  *          and \c z is an object of \p InputIterator2's \c value_type, then <tt>x + y * z</tt> is defined
  *          and is convertible to \p OutputType.
@@ -126,7 +124,7 @@ OutputType inner_product(const thrust::detail::execution_policy_base<DerivedPoli
  *  // result == 31.0f
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/inner_product.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/inner_product
  */
 template<typename InputIterator1, typename InputIterator2, typename OutputType>
 OutputType inner_product(InputIterator1 first1, InputIterator1 last1,
@@ -154,15 +152,15 @@ OutputType inner_product(InputIterator1 first1, InputIterator1 last1,
  *  \return The inner product of sequences <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator1's \c value_type is convertible to \p BinaryFunction2's \c first_argument_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *          and \p InputIterator2's \c value_type is convertible to \p BinaryFunction2's \c second_argument_type.
- *  \tparam OutputType is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam OutputType is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p OutputType is convertible to \p BinaryFunction1's \c first_argument_type.
- *  \tparam BinaryFunction1 is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *  \tparam BinaryFunction1 is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>,
  *          and \p BinaryFunction1's \c return_type is convertible to \p OutputType.
- *  \tparam BinaryFunction2 is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *  \tparam BinaryFunction2 is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>,
  *          and \p BinaryFunction2's \c return_type is convertible to \p BinaryFunction1's \c second_argument_type.
  * 
  *  \code
@@ -181,7 +179,7 @@ OutputType inner_product(InputIterator1 first1, InputIterator1 last1,
  *  // result == 31.0f
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/inner_product.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/inner_product
  */
 template<typename DerivedPolicy,
          typename InputIterator1,
@@ -219,15 +217,15 @@ OutputType inner_product(const thrust::detail::execution_policy_base<DerivedPoli
  *  \param binary_op2 Generalized multiplication operation.
  *  \return The inner product of sequences <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator1's \c value_type is convertible to \p BinaryFunction2's \c first_argument_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *          and \p InputIterator2's \c value_type is convertible to \p BinaryFunction2's \c second_argument_type.
- *  \tparam OutputType is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam OutputType is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p OutputType is convertible to \p BinaryFunction1's \c first_argument_type.
- *  \tparam BinaryFunction1 is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *  \tparam BinaryFunction1 is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>,
  *          and \p BinaryFunction1's \c return_type is convertible to \p OutputType.
- *  \tparam BinaryFunction2 is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *  \tparam BinaryFunction2 is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>,
  *          and \p BinaryFunction2's \c return_type is convertible to \p BinaryFunction1's \c second_argument_type.
  * 
  *  \code
@@ -245,7 +243,7 @@ OutputType inner_product(const thrust::detail::execution_policy_base<DerivedPoli
  *  // result == 31.0f
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/inner_product.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/inner_product
  */
 template<typename InputIterator1, typename InputIterator2, typename OutputType,
          typename BinaryFunction1, typename BinaryFunction2>
@@ -258,7 +256,7 @@ OutputType inner_product(InputIterator1 first1, InputIterator1 last1,
  *  \} // end reductions
  */
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/inner_product.inl>
 
diff --git a/thrust/iterator/constant_iterator.h b/thrust/iterator/constant_iterator.h
index cda852918..c6eec28e7 100644
--- a/thrust/iterator/constant_iterator.h
+++ b/thrust/iterator/constant_iterator.h
@@ -26,8 +26,7 @@
 #include <thrust/iterator/detail/constant_iterator_base.h>
 #include <thrust/iterator/iterator_facade.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \{
@@ -217,11 +216,11 @@ template<typename Value,
  *
  *  \see constant_iterator
  */
-template<typename V, typename I>
+template<typename ValueT, typename IndexT>
 inline __host__ __device__
-constant_iterator<V,I> make_constant_iterator(V x, I i = int())
+constant_iterator<ValueT, IndexT> make_constant_iterator(ValueT x, IndexT i = int())
 {
-  return constant_iterator<V,I>(x, i);
+  return constant_iterator<ValueT, IndexT>(x, i);
 } // end make_constant_iterator()
 
 
@@ -247,5 +246,5 @@ constant_iterator<V> make_constant_iterator(V x)
 /*! \} // end iterators
  */
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/counting_iterator.h b/thrust/iterator/counting_iterator.h
index dc5de9ae0..f66cb97ef 100644
--- a/thrust/iterator/counting_iterator.h
+++ b/thrust/iterator/counting_iterator.h
@@ -22,7 +22,7 @@
 
 /*
  * Copyright David Abrahams 2003.
- * 
+ *
  * Distributed under the Boost Software License, Version 1.0.
  * (See accompanying NOTICE file for the complete license)
  *
@@ -39,8 +39,7 @@
 // #include the details first
 #include <thrust/iterator/detail/counting_iterator.inl>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \{
@@ -65,14 +64,14 @@ namespace thrust
  *  // create iterators
  *  thrust::counting_iterator<int> first(10);
  *  thrust::counting_iterator<int> last = first + 3;
- *   
+ *
  *  first[0]   // returns 10
  *  first[1]   // returns 11
  *  first[100] // returns 110
- *   
+ *
  *  // sum of [first, last)
  *  thrust::reduce(first, last);   // returns 33 (i.e. 10 + 11 + 12)
- *   
+ *
  *  // initialize vector to [0,1,2,..]
  *  thrust::counting_iterator<int> iter(0);
  *  thrust::device_vector<int> vec(500);
@@ -89,11 +88,11 @@ namespace thrust
  *  #include <thrust/copy.h>
  *  #include <thrust/functional.h>
  *  #include <thrust/device_vector.h>
- *   
+ *
  *  int main()
  *  {
  *   // this example computes indices for all the nonzero values in a sequence
- *   
+ *
  *   // sequence of zero and nonzero values
  *   thrust::device_vector<int> stencil(8);
  *   stencil[0] = 0;
@@ -104,13 +103,13 @@ namespace thrust
  *   stencil[5] = 1;
  *   stencil[6] = 0;
  *   stencil[7] = 1;
- *   
+ *
  *   // storage for the nonzero indices
  *   thrust::device_vector<int> indices(8);
- *   
+ *
  *   // compute indices of nonzero elements
  *   typedef thrust::device_vector<int>::iterator IndexIterator;
- *   
+ *
  *   // use make_counting_iterator to define the sequence [0, 8)
  *   IndexIterator indices_end = thrust::copy_if(thrust::make_counting_iterator(0),
  *                                               thrust::make_counting_iterator(8),
@@ -118,7 +117,7 @@ namespace thrust
  *                                               indices.begin(),
  *                                               thrust::identity<int>());
  *   // indices now contains [1,2,5,7]
- *   
+ *
  *   return 0;
  *  }
  *  \endcode
@@ -145,11 +144,11 @@ template<typename Incrementable,
     /*! \endcond
      */
 
-    /*! Null constructor initializes this \p counting_iterator's \c Incrementable
-     *  counter using its null constructor.
+    /*! Default constructor initializes this \p counting_iterator's counter to
+     * `Incrementable{}`.
      */
     __host__ __device__
-    counting_iterator() {}
+    counting_iterator() : super_t(Incrementable{}) {}
 
     /*! Copy constructor copies the value of another \p counting_iterator into a
      *  new \p counting_iterator.
@@ -159,7 +158,7 @@ template<typename Incrementable,
     __host__ __device__
     counting_iterator(counting_iterator const &rhs):super_t(rhs.base()){}
 
-    /*! Copy constructor copies the value of another counting_iterator 
+    /*! Copy constructor copies the value of another counting_iterator
      *  with related System type.
      *
      *  \param rhs The \p counting_iterator to copy.
@@ -175,13 +174,17 @@ template<typename Incrementable,
 
     /*! This \c explicit constructor copies the value of an \c Incrementable
      *  into a new \p counting_iterator's \c Incrementable counter.
-     *  
+     *
      *  \param x The initial value of the new \p counting_iterator's \c Incrementable
      *         counter.
      */
     __host__ __device__
     explicit counting_iterator(Incrementable x):super_t(x){}
 
+#if THRUST_CPP_DIALECT >= 2011
+    counting_iterator & operator=(const counting_iterator &) = default;
+#endif
+
     /*! \cond
      */
   private:
@@ -239,5 +242,5 @@ counting_iterator<Incrementable> make_counting_iterator(Incrementable x)
 /*! \} // end iterators
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/any_assign.h b/thrust/iterator/detail/any_assign.h
index 4e7f2cf20..87192215c 100644
--- a/thrust/iterator/detail/any_assign.h
+++ b/thrust/iterator/detail/any_assign.h
@@ -18,8 +18,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -51,5 +50,5 @@ struct any_assign
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/any_system_tag.h b/thrust/iterator/detail/any_system_tag.h
index c49d88d1f..2c5ce6448 100644
--- a/thrust/iterator/detail/any_system_tag.h
+++ b/thrust/iterator/detail/any_system_tag.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 struct any_system_tag
   : thrust::execution_policy<any_system_tag>
@@ -30,8 +29,5 @@ struct any_system_tag
   template<typename T> operator T () const {return T();}
 };
 
-// TODO remove this in 1.7.0
-typedef THRUST_DEPRECATED any_system_tag any_space_tag;
-
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/constant_iterator_base.h b/thrust/iterator/detail/constant_iterator_base.h
index 56b1cc4f4..56bb7a5d0 100644
--- a/thrust/iterator/detail/constant_iterator_base.h
+++ b/thrust/iterator/detail/constant_iterator_base.h
@@ -16,11 +16,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/iterator_adaptor.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 // forward declaration of constant_iterator
 template<typename,typename,typename> class constant_iterator;
@@ -66,5 +67,5 @@ template<typename Value,
 
 } // end detail
   
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/counting_iterator.inl b/thrust/iterator/detail/counting_iterator.inl
index abcd87989..ee4a9df15 100644
--- a/thrust/iterator/detail/counting_iterator.inl
+++ b/thrust/iterator/detail/counting_iterator.inl
@@ -16,14 +16,15 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/numeric_traits.h>
 #include <thrust/detail/type_traits.h>
 #include <cstddef>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 // forward declaration of counting_iterator
 template <typename Incrementable, typename System, typename Traversal, typename Difference>
@@ -137,5 +138,5 @@ template<typename Difference, typename Incrementable1, typename Incrementable2>
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/device_system_tag.h b/thrust/iterator/detail/device_system_tag.h
index 394b991cd..b86109d21 100644
--- a/thrust/iterator/detail/device_system_tag.h
+++ b/thrust/iterator/detail/device_system_tag.h
@@ -23,18 +23,8 @@
 #include __THRUST_DEVICE_SYSTEM_TAG_HEADER
 #undef __THRUST_DEVICE_SYSTEM_TAG_HEADER
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 typedef thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::tag device_system_tag;
 
-} // end thrust
-
-// TODO remove this in 1.8.0
-namespace thrust
-{
-
-typedef THRUST_DEPRECATED device_system_tag device_space_tag;
-
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/iterator/detail/discard_iterator_base.h b/thrust/iterator/detail/discard_iterator_base.h
index a4a8c312b..38f77b378 100644
--- a/thrust/iterator/detail/discard_iterator_base.h
+++ b/thrust/iterator/detail/discard_iterator_base.h
@@ -22,8 +22,7 @@
 #include <thrust/iterator/detail/any_assign.h>
 #include <cstddef> // for std::ptrdiff_t
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 // forward declaration of discard_iterator
 template<typename> class discard_iterator;
@@ -60,6 +59,6 @@ template<typename System>
 
 } // end detail
   
-} // end thrust
+THRUST_NAMESPACE_END
 
 
diff --git a/thrust/iterator/detail/distance_from_result.h b/thrust/iterator/detail/distance_from_result.h
index 2b7e0d60e..fe140344d 100644
--- a/thrust/iterator/detail/distance_from_result.h
+++ b/thrust/iterator/detail/distance_from_result.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -38,5 +37,5 @@ template<typename IteratorFacade1, typename IteratorFacade2>
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/host_system_tag.h b/thrust/iterator/detail/host_system_tag.h
index a487e6ac5..58478f8d9 100644
--- a/thrust/iterator/detail/host_system_tag.h
+++ b/thrust/iterator/detail/host_system_tag.h
@@ -23,18 +23,8 @@
 #include __THRUST_HOST_SYSTEM_TAG_HEADER
 #undef __THRUST_HOST_SYSTEM_TAG_HEADER
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 typedef thrust::system::__THRUST_HOST_SYSTEM_NAMESPACE::tag host_system_tag;
 
-} // end thrust
-
-// TODO remove this in 1.8.0
-namespace thrust
-{
-
-typedef THRUST_DEPRECATED host_system_tag host_space_tag;
-
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/iterator/detail/is_iterator_category.h b/thrust/iterator/detail/is_iterator_category.h
index b538358be..e520452a3 100644
--- a/thrust/iterator/detail/is_iterator_category.h
+++ b/thrust/iterator/detail/is_iterator_category.h
@@ -20,8 +20,7 @@
 #include <thrust/iterator/iterator_categories.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -56,5 +55,5 @@ template <typename T>
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/iterator_adaptor_base.h b/thrust/iterator/detail/iterator_adaptor_base.h
index d9dbfaae6..1173e414c 100644
--- a/thrust/iterator/detail/iterator_adaptor_base.h
+++ b/thrust/iterator/detail/iterator_adaptor_base.h
@@ -16,13 +16,14 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/type_traits.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/use_default.h>
 #include <thrust/iterator/iterator_facade.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 // forward declaration of iterator_adaptor for iterator_adaptor_base below
@@ -107,5 +108,5 @@ template<typename Derived,
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/iterator_category_to_system.h b/thrust/iterator/detail/iterator_category_to_system.h
index fd378fae7..e6103b539 100644
--- a/thrust/iterator/detail/iterator_category_to_system.h
+++ b/thrust/iterator/detail/iterator_category_to_system.h
@@ -24,8 +24,7 @@
 #include <thrust/iterator/detail/any_system_tag.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -76,5 +75,5 @@ template<typename CategoryOrTraversal>
 }; // end iterator_category_or_traversal_to_system
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/iterator_category_to_traversal.h b/thrust/iterator/detail/iterator_category_to_traversal.h
index d520e9deb..d8c736c50 100644
--- a/thrust/iterator/detail/iterator_category_to_traversal.h
+++ b/thrust/iterator/detail/iterator_category_to_traversal.h
@@ -22,8 +22,7 @@
 #include <thrust/iterator/detail/iterator_category_to_system.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -32,9 +31,6 @@ namespace detail
 template <typename> struct is_iterator_system;
 template <typename> struct is_iterator_traversal;
 
-// make type_traits easy to access
-using namespace thrust::detail;
-
 template <typename Category>
   struct host_system_category_to_traversal
     : eval_if<
@@ -52,7 +48,7 @@ template <typename Category>
               eval_if<
                 is_convertible<Category, output_host_iterator_tag>::value,
                 detail::identity_<incrementable_traversal_tag>,
-                void
+                detail::identity_<void>
               >
             >
           >
@@ -80,7 +76,7 @@ template <typename Category>
               eval_if<
                 is_convertible<Category, output_device_iterator_tag>::value,
                 detail::identity_<incrementable_traversal_tag>,
-                void
+                detail::identity_<void>
               >
             >
           >
@@ -111,7 +107,7 @@ template<typename Category>
           device_system_category_to_traversal<Category>,
 
           // unknown category
-          void
+          detail::identity_<void>
         >
       >
 {};
@@ -130,5 +126,5 @@ template <typename CategoryOrTraversal>
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/iterator_category_with_system_and_traversal.h b/thrust/iterator/detail/iterator_category_with_system_and_traversal.h
index 8f5374b16..cdd8a6d36 100644
--- a/thrust/iterator/detail/iterator_category_with_system_and_traversal.h
+++ b/thrust/iterator/detail/iterator_category_with_system_and_traversal.h
@@ -18,8 +18,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -53,5 +52,5 @@ template<typename Category, typename System, typename Traversal>
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/iterator_facade_category.h b/thrust/iterator/detail/iterator_facade_category.h
index e00d3ef05..81b518002 100644
--- a/thrust/iterator/detail/iterator_facade_category.h
+++ b/thrust/iterator/detail/iterator_facade_category.h
@@ -27,8 +27,7 @@
 #include <thrust/iterator/detail/iterator_category_with_system_and_traversal.h>
 #include <thrust/iterator/detail/iterator_category_to_traversal.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -249,5 +248,5 @@ template<typename CategoryOrSystem,
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/iterator_traits.inl b/thrust/iterator/detail/iterator_traits.inl
index 8a9cc4ffb..544c24f0b 100644
--- a/thrust/iterator/detail/iterator_traits.inl
+++ b/thrust/iterator/detail/iterator_traits.inl
@@ -14,18 +14,16 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file iterator_traits.inl
- *  \brief Inline file for iterator_traits.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/iterator/iterator_categories.h>
 #include <thrust/iterator/detail/iterator_category_to_traversal.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/type_traits/void_t.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 template<typename Iterator>
   struct iterator_value
@@ -33,6 +31,8 @@ template<typename Iterator>
   typedef typename thrust::iterator_traits<Iterator>::value_type type;
 }; // end iterator_value
 
+template <typename Iterator>
+using iterator_value_t = typename iterator_value<Iterator>::type;
 
 template<typename Iterator>
   struct iterator_pointer
@@ -40,6 +40,8 @@ template<typename Iterator>
   typedef typename thrust::iterator_traits<Iterator>::pointer type;
 }; // end iterator_pointer
 
+template <typename Iterator>
+using iterator_pointer_t = typename iterator_pointer<Iterator>::type;
 
 template<typename Iterator>
   struct iterator_reference
@@ -47,6 +49,8 @@ template<typename Iterator>
   typedef typename iterator_traits<Iterator>::reference type;
 }; // end iterator_reference
 
+template <typename Iterator>
+using iterator_reference_t = typename iterator_reference<Iterator>::type;
 
 template<typename Iterator>
   struct iterator_difference
@@ -54,6 +58,9 @@ template<typename Iterator>
   typedef typename thrust::iterator_traits<Iterator>::difference_type type;
 }; // end iterator_difference
 
+template <typename Iterator>
+using iterator_difference_t = typename iterator_difference<Iterator>::type;
+
 namespace detail
 {
 
@@ -70,7 +77,7 @@ struct iterator_system_impl<
   : detail::iterator_category_to_system<
       typename iterator_traits<Iterator>::iterator_category
     >
-{}; 
+{};
 
 } // namespace detail
 
@@ -90,6 +97,8 @@ template<>
   typedef thrust::iterator_system<const int*>::type type;
 }; // end iterator_system<void*>
 
+template <typename Iterator>
+using iterator_system_t = typename iterator_system<Iterator>::type;
 
 template <typename Iterator>
   struct iterator_traversal
@@ -123,5 +132,5 @@ template<typename T>
 
 
 } // end namespace detail
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/iterator_traversal_tags.h b/thrust/iterator/detail/iterator_traversal_tags.h
index 73cd1f76a..1fbc8a1e4 100644
--- a/thrust/iterator/detail/iterator_traversal_tags.h
+++ b/thrust/iterator/detail/iterator_traversal_tags.h
@@ -16,8 +16,9 @@
 
 #pragma once
 
-namespace thrust
-{
+#include <thrust/detail/config.h>
+
+THRUST_NAMESPACE_BEGIN
 
 // define Boost's traversal tags
 struct no_traversal_tag {};
@@ -37,5 +38,5 @@ struct bidirectional_traversal_tag
 struct random_access_traversal_tag
   : bidirectional_traversal_tag {};
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/join_iterator.h b/thrust/iterator/detail/join_iterator.h
index 1ab99ce47..83f143dc0 100644
--- a/thrust/iterator/detail/join_iterator.h
+++ b/thrust/iterator/detail/join_iterator.h
@@ -23,8 +23,7 @@
 #include <thrust/detail/type_traits.h>
 
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -130,5 +129,5 @@ join_iterator<RandomAccessIterator1,RandomAccessIterator2,Size> make_join_iterat
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/minimum_category.h b/thrust/iterator/detail/minimum_category.h
index abb80d8c1..01e7e82c5 100644
--- a/thrust/iterator/detail/minimum_category.h
+++ b/thrust/iterator/detail/minimum_category.h
@@ -16,10 +16,11 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/type_traits/minimum_type.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 { 
@@ -47,6 +48,6 @@ template<typename T1,
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 
diff --git a/thrust/iterator/detail/minimum_system.h b/thrust/iterator/detail/minimum_system.h
index 45b5a592f..dcb29ccd2 100644
--- a/thrust/iterator/detail/minimum_system.h
+++ b/thrust/iterator/detail/minimum_system.h
@@ -21,8 +21,7 @@
 #include <thrust/detail/type_traits/is_metafunction_defined.h>
 #include <thrust/detail/type_traits/minimum_type.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 { 
 
@@ -78,5 +77,5 @@ template<typename T1,
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/normal_iterator.h b/thrust/iterator/detail/normal_iterator.h
index 0f6e1660e..eb5d33604 100644
--- a/thrust/iterator/detail/normal_iterator.h
+++ b/thrust/iterator/detail/normal_iterator.h
@@ -22,12 +22,13 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/iterator/iterator_adaptor.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/type_traits/is_contiguous_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -74,5 +75,5 @@ struct proclaim_contiguous_iterator<
   thrust::detail::normal_iterator<T>
 > : true_type {};
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/permutation_iterator_base.h b/thrust/iterator/detail/permutation_iterator_base.h
index 2610cfdfa..d586cabb7 100644
--- a/thrust/iterator/detail/permutation_iterator_base.h
+++ b/thrust/iterator/detail/permutation_iterator_base.h
@@ -16,13 +16,14 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/iterator/iterator_adaptor.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/iterator/detail/minimum_system.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 template<typename,typename> class permutation_iterator;
 
@@ -49,5 +50,5 @@ template<typename ElementIterator,
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/retag.h b/thrust/iterator/detail/retag.h
index a512d3640..d277d8b6f 100644
--- a/thrust/iterator/detail/retag.h
+++ b/thrust/iterator/detail/retag.h
@@ -21,8 +21,7 @@
 #include <thrust/iterator/detail/tagged_iterator.h>
 #include <thrust/detail/pointer.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -144,5 +143,5 @@ __host__ __device__
 } // end retag()
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/reverse_iterator.inl b/thrust/iterator/detail/reverse_iterator.inl
index 5eb9ac5ff..9182ac3e8 100644
--- a/thrust/iterator/detail/reverse_iterator.inl
+++ b/thrust/iterator/detail/reverse_iterator.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,11 +14,14 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/iterator/reverse_iterator.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -47,14 +50,14 @@ template<typename BidirectionalIterator>
     reverse_iterator<BidirectionalIterator>
       ::reverse_iterator(reverse_iterator<OtherBidirectionalIterator> const &r
 // XXX msvc screws this up
-#ifndef _MSC_VER
+#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
                      , typename thrust::detail::enable_if<
                          thrust::detail::is_convertible<
                            OtherBidirectionalIterator,
                            BidirectionalIterator
                          >::value
                        >::type *
-#endif // _MSC_VER
+#endif // MSVC
                      )
         :super_t(r.base())
 {
@@ -111,5 +114,5 @@ reverse_iterator<BidirectionalIterator> make_reverse_iterator(BidirectionalItera
 } // end make_reverse_iterator()
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/reverse_iterator_base.h b/thrust/iterator/detail/reverse_iterator_base.h
index 68fa1f2f8..de3bafde9 100644
--- a/thrust/iterator/detail/reverse_iterator_base.h
+++ b/thrust/iterator/detail/reverse_iterator_base.h
@@ -16,11 +16,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/iterator/iterator_adaptor.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 template <typename> class reverse_iterator;
 
@@ -38,5 +39,5 @@ template<typename BidirectionalIterator>
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/tagged_iterator.h b/thrust/iterator/detail/tagged_iterator.h
index 125a4675e..24cbbb736 100644
--- a/thrust/iterator/detail/tagged_iterator.h
+++ b/thrust/iterator/detail/tagged_iterator.h
@@ -22,8 +22,7 @@
 #include <thrust/detail/use_default.h>
 #include <thrust/type_traits/is_contiguous_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -59,6 +58,21 @@ template<typename Iterator, typename Tag>
       : super_t(x) {}
 }; // end tagged_iterator
 
+/*! \p make_tagged_iterator creates a \p tagged_iterator
+ *  from a \c Iterator with system tag \c Tag.
+ *
+ *  \tparam Tag Any system tag.
+ *  \tparam Iterator Any iterator type.
+ *  \param iter The iterator of interest.
+ *  \return An iterator whose system tag is \p Tag and whose behavior is otherwise
+ *          equivalent to \p iter.
+ */
+template <typename Tag, typename Iterator>
+inline auto make_tagged_iterator(Iterator iter) -> tagged_iterator<Iterator, Tag>
+{
+  return tagged_iterator<Iterator, Tag>(iter);
+}
+
 } // end detail
 
 // tagged_iterator is trivial if its base iterator is.
@@ -67,5 +81,5 @@ struct proclaim_contiguous_iterator<
   detail::tagged_iterator<BaseIterator, Tag>
 > : is_contiguous_iterator<BaseIterator> {};
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/transform_input_output_iterator.inl b/thrust/iterator/detail/transform_input_output_iterator.inl
new file mode 100644
index 000000000..b4792f724
--- /dev/null
+++ b/thrust/iterator/detail/transform_input_output_iterator.inl
@@ -0,0 +1,107 @@
+/*
+ *  Copyright 2020-2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_adaptor.h>
+#include <thrust/detail/type_traits.h>
+
+THRUST_NAMESPACE_BEGIN
+
+template <typename InputFunction, typename OutputFunction, typename Iterator>
+  class transform_input_output_iterator;
+
+namespace detail
+{
+
+// Proxy reference that invokes InputFunction when reading from and
+// OutputFunction when writing to the dereferenced iterator
+template <typename InputFunction, typename OutputFunction, typename Iterator>
+  class transform_input_output_iterator_proxy
+{
+  using iterator_value_type = typename thrust::iterator_value<Iterator>::type;
+
+  using Value = invoke_result_t<InputFunction, iterator_value_type>;
+
+  public:
+    __host__ __device__
+    transform_input_output_iterator_proxy(const Iterator& io, InputFunction input_function, OutputFunction output_function)
+      : io(io), input_function(input_function), output_function(output_function)
+    {
+    }
+
+    transform_input_output_iterator_proxy(const transform_input_output_iterator_proxy&) = default;
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    operator Value const() const
+    {
+      return input_function(*io);
+    }
+
+    __thrust_exec_check_disable__
+    template <typename T>
+    __host__ __device__
+    transform_input_output_iterator_proxy operator=(const T& x)
+    {
+      *io = output_function(x);
+      return *this;
+    }
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    transform_input_output_iterator_proxy operator=(const transform_input_output_iterator_proxy& x)
+    {
+      *io = output_function(x);
+      return *this;
+    }
+
+  private:
+    Iterator io;
+    InputFunction input_function;
+    OutputFunction output_function;
+};
+
+// Compute the iterator_adaptor instantiation to be used for transform_input_output_iterator
+template <typename InputFunction, typename OutputFunction, typename Iterator>
+struct transform_input_output_iterator_base
+{
+private:
+  using iterator_value_type = typename thrust::iterator_value<Iterator>::type;
+
+public:
+    typedef thrust::iterator_adaptor
+    <
+        transform_input_output_iterator<InputFunction, OutputFunction, Iterator>
+      , Iterator
+      , detail::invoke_result_t<InputFunction, iterator_value_type>
+      , thrust::use_default
+      , thrust::use_default
+      , transform_input_output_iterator_proxy<InputFunction, OutputFunction, Iterator>
+    > type;
+};
+
+// Register transform_input_output_iterator_proxy with 'is_proxy_reference' from
+// type_traits to enable its use with algorithms.
+template <typename InputFunction, typename OutputFunction, typename Iterator>
+struct is_proxy_reference<
+    transform_input_output_iterator_proxy<InputFunction, OutputFunction, Iterator> >
+    : public thrust::detail::true_type {};
+
+} // end detail
+THRUST_NAMESPACE_END
+
diff --git a/thrust/iterator/detail/transform_iterator.inl b/thrust/iterator/detail/transform_iterator.inl
index 65eee8687..0dc6f9854 100644
--- a/thrust/iterator/detail/transform_iterator.inl
+++ b/thrust/iterator/detail/transform_iterator.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,19 +14,22 @@
  *  limitations under the License.
  */
 
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/iterator/iterator_adaptor.h>
-#include <thrust/iterator/iterator_traits.h>
+#pragma once
+
+#include <thrust/detail/config.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/type_traits/result_of_adaptable_function.h>
+#include <thrust/iterator/iterator_adaptor.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/type_traits/remove_cvref.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 template <class UnaryFunction, class Iterator, class Reference, class Value>
   class transform_iterator;
-  
-namespace detail 
+
+namespace detail
 {
 
 // Compute the iterator_adaptor instantiation to be used for transform_iterator
@@ -40,22 +43,16 @@ struct transform_iterator_base
       thrust::detail::result_of_adaptable_function<UnaryFunc(typename thrust::iterator_value<Iterator>::type)>
     >::type reference;
 
-    // To get the default for Value: remove any reference on the
-    // result type, but retain any constness to signal
-    // non-writability.  Note that if we adopt Thomas' suggestion
-    // to key non-writability *only* on the Reference argument,
-    // we'd need to strip constness here as well.
-    typedef typename thrust::detail::ia_dflt_help<
-      Value,
-      thrust::detail::remove_reference<reference>
-    >::type cv_value_type;
+    // To get the default for Value: remove cvref on the result type.
+    using value_type =
+      typename thrust::detail::ia_dflt_help<Value, thrust::remove_cvref<reference>>::type;
 
- public:
+  public:
     typedef thrust::iterator_adaptor
     <
         transform_iterator<UnaryFunc, Iterator, Reference, Value>
       , Iterator
-      , cv_value_type
+      , value_type
       , thrust::use_default   // Leave the system alone
         //, thrust::use_default   // Leave the traversal alone
         // use the Iterator's category to let any system iterators remain random access even though
@@ -68,5 +65,5 @@ struct transform_iterator_base
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/transform_output_iterator.inl b/thrust/iterator/detail/transform_output_iterator.inl
index 85265a4e6..d5033f105 100644
--- a/thrust/iterator/detail/transform_output_iterator.inl
+++ b/thrust/iterator/detail/transform_output_iterator.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2016 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,19 +14,22 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/iterator/transform_output_iterator.h>
 #include <thrust/iterator/iterator_adaptor.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
-template <typename OutputIterator, typename UnaryFunction>
+template <typename UnaryFunction, typename OutputIterator>
   class transform_output_iterator;
 
-namespace detail 
+namespace detail
 {
 
-// Proxy reference that uses Unary Functiont o transform the rhs of assigment
+// Proxy reference that uses Unary Function to transform the rhs of assigment
 // operator before writing the result to OutputIterator
 template <typename UnaryFunction, typename OutputIterator>
   class transform_output_iterator_proxy
@@ -66,13 +69,13 @@ struct transform_output_iterator_base
     > type;
 };
 
-// Register trasnform_output_iterator_proxy with 'is_proxy_reference' from
+// Register transform_output_iterator_proxy with 'is_proxy_reference' from
 // type_traits to enable its use with algorithms.
-template <class OutputIterator, class UnaryFunction>
+template <class UnaryFunction, class OutputIterator>
 struct is_proxy_reference<
-    transform_output_iterator_proxy<OutputIterator, UnaryFunction> >
+    transform_output_iterator_proxy<UnaryFunction, OutputIterator> >
     : public thrust::detail::true_type {};
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/tuple_of_iterator_references.h b/thrust/iterator/detail/tuple_of_iterator_references.h
index 93d7e05e4..78c5e8a28 100644
--- a/thrust/iterator/detail/tuple_of_iterator_references.h
+++ b/thrust/iterator/detail/tuple_of_iterator_references.h
@@ -21,23 +21,19 @@
 #include <thrust/pair.h>
 #include <thrust/detail/reference_forward_declaration.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
   
 template<
-  typename T0, typename T1, typename T2,
-  typename T3, typename T4, typename T5,
-  typename T6, typename T7, typename T8,
-  typename T9
+  typename... Ts
 >
   class tuple_of_iterator_references
-    : public thrust::tuple<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+    : public thrust::tuple<Ts...>
 {
   private:
-    typedef thrust::tuple<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> super_t;
+    typedef thrust::tuple<Ts...> super_t;
 
   public:
     // allow implicit construction from tuple<refs>
@@ -49,9 +45,9 @@ template<
     // allow assignment from tuples
     // XXX might be worthwhile to guard this with an enable_if is_assignable
     __thrust_exec_check_disable__
-    template<typename U1, typename U2>
+    template<typename... Us>
     inline __host__ __device__
-    tuple_of_iterator_references &operator=(const detail::cons<U1,U2> &other)
+    tuple_of_iterator_references &operator=(const thrust::tuple<Us...> &other)
     {
       super_t::operator=(other);
       return *this;
@@ -72,24 +68,21 @@ template<
     // XXX perhaps we should generalize to reference<T>
     //     we could captures reference<pair> this way
     __thrust_exec_check_disable__
-    template<typename U0, typename U1, typename U2,
-             typename U3, typename U4, typename U5,
-             typename U6, typename U7, typename U8,
-             typename U9,
-             typename Pointer, typename Derived>
+    template<typename Pointer, typename Derived,
+             typename... Us>
     inline __host__ __device__
 // XXX gcc-4.2 crashes on is_assignable
 //    typename thrust::detail::enable_if<
 //      thrust::detail::is_assignable<
 //        super_t,
-//        const thrust::tuple<U0,U1,U2,U3,U4,U5,U6,U7,U8,U9>
+//        const thrust::tuple<Us...>
 //      >::value,
 //      tuple_of_iterator_references &
 //    >::type
     tuple_of_iterator_references &
-    operator=(const thrust::reference<thrust::tuple<U0,U1,U2,U3,U4,U5,U6,U7,U8,U9>, Pointer, Derived> &other)
+    operator=(const thrust::reference<thrust::tuple<Us...>, Pointer, Derived> &other)
     {
-      typedef thrust::tuple<U0,U1,U2,U3,U4,U5,U6,U7,U8,U9> tuple_type;
+      typedef thrust::tuple<Us...> tuple_type;
 
       // XXX perhaps this could be accelerated
       tuple_type other_tuple = other;
@@ -102,144 +95,9 @@ template<
     inline __host__ __device__
     tuple_of_iterator_references() {}
 
-    inline __host__ __device__ 
-    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0)
-      : super_t(t0,
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()))
-    {}
-
-    inline __host__ __device__ 
-    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
-                                 typename access_traits<T1>::parameter_type t1)
-      : super_t(t0, t1,
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()))
-    {}
-
-    inline __host__ __device__ 
-    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
-                                 typename access_traits<T1>::parameter_type t1,
-                                 typename access_traits<T2>::parameter_type t2)
-      : super_t(t0, t1, t2,
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()))
-    {}
-
-    inline __host__ __device__ 
-    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
-                                 typename access_traits<T1>::parameter_type t1,
-                                 typename access_traits<T2>::parameter_type t2,
-                                 typename access_traits<T3>::parameter_type t3)
-      : super_t(t0, t1, t2, t3,
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()))
-    {}
-
-    inline __host__ __device__ 
-    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
-                                 typename access_traits<T1>::parameter_type t1,
-                                 typename access_traits<T2>::parameter_type t2,
-                                 typename access_traits<T3>::parameter_type t3,
-                                 typename access_traits<T4>::parameter_type t4)
-      : super_t(t0, t1, t2, t3, t4,
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()))
-    {}
-
-    inline __host__ __device__ 
-    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
-                                 typename access_traits<T1>::parameter_type t1,
-                                 typename access_traits<T2>::parameter_type t2,
-                                 typename access_traits<T3>::parameter_type t3,
-                                 typename access_traits<T4>::parameter_type t4,
-                                 typename access_traits<T5>::parameter_type t5)
-      : super_t(t0, t1, t2, t3, t4, t5,
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()))
-    {}
-
-    inline __host__ __device__ 
-    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
-                                 typename access_traits<T1>::parameter_type t1,
-                                 typename access_traits<T2>::parameter_type t2,
-                                 typename access_traits<T3>::parameter_type t3,
-                                 typename access_traits<T4>::parameter_type t4,
-                                 typename access_traits<T5>::parameter_type t5,
-                                 typename access_traits<T6>::parameter_type t6)
-      : super_t(t0, t1, t2, t3, t4, t5, t6,
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()))
-    {}
-
-    inline __host__ __device__ 
-    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
-                                 typename access_traits<T1>::parameter_type t1,
-                                 typename access_traits<T2>::parameter_type t2,
-                                 typename access_traits<T3>::parameter_type t3,
-                                 typename access_traits<T4>::parameter_type t4,
-                                 typename access_traits<T5>::parameter_type t5,
-                                 typename access_traits<T6>::parameter_type t6,
-                                 typename access_traits<T7>::parameter_type t7)
-      : super_t(t0, t1, t2, t3, t4, t5, t6, t7,
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()))
-    {}
-
-    inline __host__ __device__ 
-    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
-                                 typename access_traits<T1>::parameter_type t1,
-                                 typename access_traits<T2>::parameter_type t2,
-                                 typename access_traits<T3>::parameter_type t3,
-                                 typename access_traits<T4>::parameter_type t4,
-                                 typename access_traits<T5>::parameter_type t5,
-                                 typename access_traits<T6>::parameter_type t6,
-                                 typename access_traits<T7>::parameter_type t7,
-                                 typename access_traits<T8>::parameter_type t8)
-      : super_t(t0, t1, t2, t3, t4, t5, t6, t7, t8,
-                static_cast<const null_type&>(null_type()))
-    {}
-
-    inline __host__ __device__ 
-    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
-                                 typename access_traits<T1>::parameter_type t1,
-                                 typename access_traits<T2>::parameter_type t2,
-                                 typename access_traits<T3>::parameter_type t3,
-                                 typename access_traits<T4>::parameter_type t4,
-                                 typename access_traits<T5>::parameter_type t5,
-                                 typename access_traits<T6>::parameter_type t6,
-                                 typename access_traits<T7>::parameter_type t7,
-                                 typename access_traits<T8>::parameter_type t8,
-                                 typename access_traits<T9>::parameter_type t9)
-      : super_t(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9)
+    inline __host__ __device__
+    tuple_of_iterator_references(typename access_traits<Ts>::parameter_type... ts)
+      : super_t(ts...)
     {}
 };
 
@@ -247,17 +105,42 @@ template<
 // this overload of swap() permits swapping tuple_of_iterator_references returned as temporaries from
 // iterator dereferences
 template<
-  typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9,
-  typename U0, typename U1, typename U2, typename U3, typename U4, typename U5, typename U6, typename U7, typename U8, typename U9
+  typename... Ts,
+  typename... Us
 >
 inline __host__ __device__
-void swap(tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> x,
-          tuple_of_iterator_references<U0,U1,U2,U3,U4,U5,U6,U7,U8,U9> y)
+void swap(tuple_of_iterator_references<Ts...> x,
+          tuple_of_iterator_references<Us...> y)
 {
   x.swap(y);
 }
 
 
 } // end detail
-} // end thrust
+
+// define tuple_size, tuple_element, etc.
+template<class... Ts>
+struct tuple_size<detail::tuple_of_iterator_references<Ts...>>
+  : std::integral_constant<size_t, sizeof...(Ts)>
+{};
+
+template<size_t i>
+struct tuple_element<i, detail::tuple_of_iterator_references<>> {};
+
+
+template<class T, class... Ts>
+struct tuple_element<0, detail::tuple_of_iterator_references<T,Ts...>>
+{
+  using type = T;
+};
+
+
+template<size_t i, class T, class... Ts>
+struct tuple_element<i, detail::tuple_of_iterator_references<T,Ts...>>
+{
+  using type = typename tuple_element<i - 1, detail::tuple_of_iterator_references<Ts...>>::type;
+};
+
+
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/universal_categories.h b/thrust/iterator/detail/universal_categories.h
index 2389796b1..d2abd7f55 100644
--- a/thrust/iterator/detail/universal_categories.h
+++ b/thrust/iterator/detail/universal_categories.h
@@ -21,8 +21,7 @@
 
 // XXX eliminate this file
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 // define these types without inheritance to avoid ambiguous conversion to base classes
 
@@ -83,5 +82,5 @@ struct random_access_universal_iterator_tag
 };
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/zip_iterator.inl b/thrust/iterator/detail/zip_iterator.inl
index 7eb35b091..a2bc98afe 100644
--- a/thrust/iterator/detail/zip_iterator.inl
+++ b/thrust/iterator/detail/zip_iterator.inl
@@ -16,11 +16,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/detail/tuple_transform.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 template<typename IteratorTuple>
@@ -131,13 +132,21 @@ template<typename IteratorTuple>
 } // end zip_iterator::distance_to()
 
 
-template<typename IteratorTuple>
+template<typename... Iterators>
+__host__ __device__
+  zip_iterator<thrust::tuple<Iterators...>> make_zip_iterator(thrust::tuple<Iterators...> t)
+{
+  return zip_iterator<thrust::tuple<Iterators...>>(t);
+} // end make_zip_iterator()
+
+
+template<typename... Iterators>
 __host__ __device__
-  zip_iterator<IteratorTuple> make_zip_iterator(IteratorTuple t)
+  zip_iterator<thrust::tuple<Iterators...>> make_zip_iterator(Iterators... its)
 {
-  return zip_iterator<IteratorTuple>(t);
+  return make_zip_iterator(thrust::make_tuple(its...));
 } // end make_zip_iterator()
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/zip_iterator_base.h b/thrust/iterator/detail/zip_iterator_base.h
index e0d941c8f..030153b65 100644
--- a/thrust/iterator/detail/zip_iterator_base.h
+++ b/thrust/iterator/detail/zip_iterator_base.h
@@ -16,19 +16,22 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
+#include <thrust/advance.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/iterator/iterator_facade.h>
 #include <thrust/iterator/iterator_categories.h>
 #include <thrust/iterator/detail/minimum_category.h>
 #include <thrust/iterator/detail/minimum_system.h>
+#include <thrust/type_traits/integer_sequence.h>
 #include <thrust/tuple.h>
 #include <thrust/detail/tuple_meta_transform.h>
 #include <thrust/detail/tuple_transform.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/iterator/detail/tuple_of_iterator_references.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 // forward declare zip_iterator for zip_iterator_base
 template<typename IteratorTuple> class zip_iterator;
@@ -45,12 +48,12 @@ class advance_iterator
 public:
   inline __host__ __device__
   advance_iterator(DiffType step) : m_step(step) {}
-  
+
   __thrust_exec_check_disable__
   template<typename Iterator>
   inline __host__ __device__
   void operator()(Iterator& it) const
-  { it += m_step; }
+  { thrust::advance(it, m_step); }
 
 private:
   DiffType m_step;
@@ -127,17 +130,28 @@ template<class Tuple, class BinaryMetaFun, class StartType>
   struct tuple_meta_accumulate;
 
 template<
-    typename Tuple
-  , class BinaryMetaFun
+    class BinaryMetaFun
+  , typename StartType
+>
+  struct tuple_meta_accumulate<thrust::tuple<>,BinaryMetaFun,StartType>
+{
+   typedef typename thrust::detail::identity_<StartType>::type type;
+};
+
+
+template<
+    class BinaryMetaFun
   , typename StartType
+  , typename    T
+  , typename... Ts
 >
-  struct tuple_meta_accumulate_impl
+  struct tuple_meta_accumulate<thrust::tuple<T,Ts...>,BinaryMetaFun,StartType>
 {
    typedef typename apply2<
        BinaryMetaFun
-     , typename Tuple::head_type
+     , T
      , typename tuple_meta_accumulate<
-           typename Tuple::tail_type
+           thrust::tuple<Ts...>
          , BinaryMetaFun
          , StartType 
        >::type
@@ -145,81 +159,40 @@ template<
 };
 
 
-template<
-    typename Tuple
-  , class BinaryMetaFun
-  , typename StartType
->
-struct tuple_meta_accumulate
-  : thrust::detail::eval_if<
-        thrust::detail::is_same<Tuple, thrust::null_type>::value
-      , thrust::detail::identity_<StartType>
-      , tuple_meta_accumulate_impl<
-            Tuple
-          , BinaryMetaFun
-          , StartType
-        >
-    > // end eval_if
+template<typename Fun>
+inline __host__ __device__
+Fun tuple_for_each_helper(Fun f)
 {
-}; // end tuple_meta_accumulate
-
-
-// transform algorithm for tuples. The template parameter Fun
-// must be a unary functor which is also a unary metafunction
-// class that computes its return type based on its argument
-// type. For example:
-//
-// struct to_ptr
-// {
-//     template <class Arg>
-//     struct apply
-//     {
-//          typedef Arg* type;
-//     }
-//
-//     template <class Arg>
-//     Arg* operator()(Arg x);
-// };
-
+  return f;
+}
 
+template<typename Fun, typename T, typename... Ts>
+inline __host__ __device__
+Fun tuple_for_each_helper(Fun f, T& t, Ts&... ts)
+{
+  f(t);
+  return tuple_for_each_helper(f, ts...);
+}
 
 // for_each algorithm for tuples.
-template<typename Fun>
+
+template<typename Fun, typename... Ts, size_t... Is>
 inline __host__ __device__
-Fun tuple_for_each(thrust::null_type, Fun f)
+Fun tuple_for_each(thrust::tuple<Ts...>& t, Fun f, thrust::index_sequence<Is...>)
 {
-  return f;
+  return tuple_for_each_helper(f, thrust::get<Is>(t)...);
 } // end tuple_for_each()
 
-
-template<typename Tuple, typename Fun>
+// for_each algorithm for tuples.
+template<typename Fun, typename... Ts>
 inline __host__ __device__
-Fun tuple_for_each(Tuple& t, Fun f)
+Fun tuple_for_each(thrust::tuple<Ts...>& t, Fun f)
 { 
-  f( t.get_head() );
-  return tuple_for_each(t.get_tail(), f);
-} // end tuple_for_each()
+  return tuple_for_each(t, f, thrust::make_index_sequence<thrust::tuple_size<thrust::tuple<Ts...>>::value>{});
+}
 
 
-// Equality of tuples. NOTE: "==" for tuples currently (7/2003)
-// has problems under some compilers, so I just do my own.
-// No point in bringing in a bunch of #ifdefs here. This is
-// going to go away with the next tuple implementation anyway.
-//
-__host__ __device__
-inline bool tuple_equal(thrust::null_type, thrust::null_type)
-{ return true; }
-
-
-template<typename Tuple1, typename Tuple2>
-__host__ __device__
-bool tuple_equal(Tuple1 const& t1, Tuple2 const& t2)
-{ 
-  return t1.get_head() == t2.get_head() && 
-  tuple_equal(t1.get_tail(), t2.get_tail());
-} // end tuple_equal()
-
-} // end end tuple_impl_specific
+} // end tuple_impl_specific
 
 
 // Metafunction to obtain the type of the tuple whose element types
@@ -293,29 +266,16 @@ namespace zip_iterator_base_ns
 {
 
 
-template<int i, typename Tuple>
-  struct tuple_elements_helper
-    : eval_if<
-        (i < tuple_size<Tuple>::value),
-        tuple_element<i,Tuple>,
-        identity_<thrust::null_type>
-      >
-{};
+template<typename Tuple, typename IndexSequence>
+  struct tuple_of_iterator_references_helper;
 
 
-template<typename Tuple>
-  struct tuple_elements
+template<typename Tuple, size_t... Is>
+  struct tuple_of_iterator_references_helper<Tuple, thrust::index_sequence<Is...>>
 {
-  typedef typename tuple_elements_helper<0,Tuple>::type T0;
-  typedef typename tuple_elements_helper<1,Tuple>::type T1;
-  typedef typename tuple_elements_helper<2,Tuple>::type T2;
-  typedef typename tuple_elements_helper<3,Tuple>::type T3;
-  typedef typename tuple_elements_helper<4,Tuple>::type T4;
-  typedef typename tuple_elements_helper<5,Tuple>::type T5;
-  typedef typename tuple_elements_helper<6,Tuple>::type T6;
-  typedef typename tuple_elements_helper<7,Tuple>::type T7;
-  typedef typename tuple_elements_helper<8,Tuple>::type T8;
-  typedef typename tuple_elements_helper<9,Tuple>::type T9;
+  typedef thrust::detail::tuple_of_iterator_references<
+    typename thrust::tuple_element<Is,Tuple>::type...
+  > type;
 };
 
 
@@ -328,22 +288,11 @@ template<typename IteratorTuple>
     iterator_reference
   >::type tuple_of_references;
 
-  // get at the individual tuple element types by name
-  typedef tuple_elements<tuple_of_references> elements;
-
   // map thrust::tuple<T...> to tuple_of_iterator_references<T...>
-  typedef thrust::detail::tuple_of_iterator_references<
-    typename elements::T0,
-    typename elements::T1,
-    typename elements::T2,
-    typename elements::T3,
-    typename elements::T4,
-    typename elements::T5,
-    typename elements::T6,
-    typename elements::T7,
-    typename elements::T8,
-    typename elements::T9
-  > type;
+  typedef typename tuple_of_iterator_references_helper<
+    tuple_of_references,
+    thrust::make_index_sequence<thrust::tuple_size<tuple_of_references>::value>
+  >::type type;
 };
 
 
@@ -399,6 +348,6 @@ template<typename IteratorTuple>
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 
diff --git a/thrust/iterator/discard_iterator.h b/thrust/iterator/discard_iterator.h
index d0603e2c0..eb5156eda 100644
--- a/thrust/iterator/discard_iterator.h
+++ b/thrust/iterator/discard_iterator.h
@@ -27,8 +27,7 @@
 
 THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \{
@@ -81,9 +80,9 @@ namespace thrust
  *                          values.begin(),
  *                          thrust::make_discard_iterator(),
  *                          result.begin());
- *    
+ *
  *    // result is now [9, 21, 9, 3]
- *    
+ *
  *    return 0;
  *  }
  *  \endcode
@@ -116,9 +115,13 @@ template<typename System = use_default>
     discard_iterator(discard_iterator const &rhs)
       : super_t(rhs.base()) {}
 
+#if THRUST_CPP_DIALECT >= 2011
+    discard_iterator & operator=(const discard_iterator &) = default;
+#endif
+
     /*! This constructor receives an optional index specifying the position of this
      *  \p discard_iterator in a range.
-     *  
+     *
      *  \p i The index of this \p discard_iterator in a range. Defaults to the
      *       value returned by \c Incrementable's null constructor. For example,
      *       when <tt>Incrementable == int</tt>, \c 0.
@@ -129,7 +132,7 @@ template<typename System = use_default>
 
     /*! \cond
      */
-  
+
   private: // Core iterator interface
     __host__ __device__
     reference dereference() const
@@ -165,7 +168,7 @@ discard_iterator<> make_discard_iterator(discard_iterator<>::difference_type i =
 /*! \} // end iterators
  */
 
-} // end namespace thrust
-  
+THRUST_NAMESPACE_END
+
 THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
 
diff --git a/thrust/iterator/iterator_adaptor.h b/thrust/iterator/iterator_adaptor.h
index c3c9b8655..67d4866b9 100644
--- a/thrust/iterator/iterator_adaptor.h
+++ b/thrust/iterator/iterator_adaptor.h
@@ -37,8 +37,7 @@
 #include <thrust/detail/use_default.h>
 #include <thrust/iterator/detail/iterator_adaptor_base.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \{
@@ -201,7 +200,10 @@ template<typename Derived,
     void advance(typename iterator_adaptor::difference_type n)
     {
       // XXX statically assert on random_access_traversal_tag
-      m_iterator += n;
+
+      // counting_iterator will pick eg. diff_t=int64 when base=int32.
+      // Explicitly cast to avoid static conversion warnings.
+      m_iterator = static_cast<base_type>(m_iterator + n);
     }
 
     __thrust_exec_check_disable__
@@ -236,5 +238,5 @@ template<typename Derived,
 /*! \} // end iterators
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/iterator_categories.h b/thrust/iterator/iterator_categories.h
index 02246d446..9a6f3f4ae 100644
--- a/thrust/iterator/iterator_categories.h
+++ b/thrust/iterator/iterator_categories.h
@@ -39,8 +39,7 @@
 // #include this for stl's iterator tags
 #include <iterator>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \addtogroup iterator_tags Iterator Tags
@@ -55,7 +54,7 @@ namespace thrust
  *  representation of the Input Device Iterator concept within the C++ type
  *  system.
  *
- *  \see http://www.sgi.com/tech/sgi/input_iterator_tag.html, iterator_traits,
+ *  \see https://en.cppreference.com/w/cpp/iterator/iterator_tags  iterator_traits,
  *  output_device_iterator_tag, forward_device_iterator_tag,
  *  bidirectional_device_iterator_tag, random_access_device_iterator_tag,
  *  input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag,
@@ -74,7 +73,7 @@ struct input_device_iterator_tag
  *  representation of the Output Device Iterator concept within the C++ type
  *  system.
  *
- *  \see http://www.sgi.com/tech/sgi/output_iterator_tag.html, iterator_traits,
+ *  \see https://en.cppreference.com/w/cpp/iterator/iterator_tags  iterator_traits,
  *  input_device_iterator_tag, forward_device_iterator_tag,
  *  bidirectional_device_iterator_tag, random_access_device_iterator_tag,
  *  input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag,
@@ -93,7 +92,7 @@ struct output_device_iterator_tag
  *  representation of the Forward Device Iterator concept within the C++ type
  *  system.
  *
- *  \see http://www.sgi.com/tech/sgi/forward_iterator_tag.html, iterator_traits,
+ *  \see https://en.cppreference.com/w/cpp/iterator/iterator_tags  iterator_traits,
  *  input_device_iterator_tag, output_device_iterator_tag,
  *  bidirectional_device_iterator_tag, random_access_device_iterator_tag,
  *  input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag,
@@ -112,7 +111,7 @@ struct forward_device_iterator_tag
  *  representation of the Bidirectional Device Iterator concept within the C++
  *  type system.
  *
- *  \see http://www.sgi.com/tech/sgi/bidirectional_iterator_tag.html,
+ *  \see https://en.cppreference.com/w/cpp/iterator/iterator_tags 
  *  iterator_traits, input_device_iterator_tag, output_device_iterator_tag,
  *  forward_device_iterator_tag, random_access_device_iterator_tag,
  *  input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag,
@@ -131,7 +130,7 @@ struct bidirectional_device_iterator_tag
  *  representation of the Random Access Device Iterator concept within the C++
  *  type system.
  *
- *  \see http://www.sgi.com/tech/sgi/random_access_iterator_tag.html,
+ *  \see https://en.cppreference.com/w/cpp/iterator/iterator_tags 
  *  iterator_traits, input_device_iterator_tag, output_device_iterator_tag,
  *  forward_device_iterator_tag, bidirectional_device_iterator_tag,
  *  input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag,
@@ -150,7 +149,7 @@ struct random_access_device_iterator_tag
  *  representation of the Input Host Iterator concept within the C++
  *  type system.
  *
- *  \see http://www.sgi.com/tech/sgi/input_iterator_tag.html,
+ *  \see https://en.cppreference.com/w/cpp/iterator/iterator_tags 
  *  iterator_traits, input_device_iterator_tag, output_device_iterator_tag,
  *  forward_device_iterator_tag, bidirectional_device_iterator_tag,
  *  random_access_device_iterator_tag,
@@ -164,7 +163,7 @@ typedef std::input_iterator_tag input_host_iterator_tag;
  *  representation of the Output Host Iterator concept within the C++
  *  type system.
  *
- *  \see http://www.sgi.com/tech/sgi/output_iterator_tag.html,
+ *  \see https://en.cppreference.com/w/cpp/iterator/iterator_tags 
  *  iterator_traits, input_device_iterator_tag, output_device_iterator_tag,
  *  forward_device_iterator_tag, bidirectional_device_iterator_tag,
  *  random_access_device_iterator_tag,
@@ -178,7 +177,7 @@ typedef std::output_iterator_tag output_host_iterator_tag;
  *  representation of the Forward Host Iterator concept within the C++
  *  type system.
  *
- *  \see http://www.sgi.com/tech/sgi/forward_iterator_tag.html,
+ *  \see https://en.cppreference.com/w/cpp/iterator/iterator_tags 
  *  iterator_traits, input_device_iterator_tag, output_device_iterator_tag,
  *  forward_device_iterator_tag, bidirectional_device_iterator_tag,
  *  random_access_device_iterator_tag,
@@ -192,7 +191,7 @@ typedef std::forward_iterator_tag forward_host_iterator_tag;
  *  representation of the Forward Host Iterator concept within the C++
  *  type system.
  *
- *  \see http://www.sgi.com/tech/sgi/bidirectional_iterator_tag.html,
+ *  \see https://en.cppreference.com/w/cpp/iterator/iterator_tags 
  *  iterator_traits, input_device_iterator_tag, output_device_iterator_tag,
  *  forward_device_iterator_tag, bidirectional_device_iterator_tag,
  *  random_access_device_iterator_tag,
@@ -206,7 +205,7 @@ typedef std::bidirectional_iterator_tag bidirectional_host_iterator_tag;
  *  representation of the Forward Host Iterator concept within the C++
  *  type system.
  *
- *  \see http://www.sgi.com/tech/sgi/random_access_iterator_tag.html,
+ *  \see https://en.cppreference.com/w/cpp/iterator/iterator_tags 
  *  iterator_traits, input_device_iterator_tag, output_device_iterator_tag,
  *  forward_device_iterator_tag, bidirectional_device_iterator_tag,
  *  random_access_device_iterator_tag,
@@ -218,7 +217,7 @@ typedef std::random_access_iterator_tag random_access_host_iterator_tag;
 /*! \} // end iterator_tag_classes
  */
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/iterator/detail/universal_categories.h>
 
diff --git a/thrust/iterator/iterator_facade.h b/thrust/iterator/iterator_facade.h
index 86757d712..f6920c5c8 100644
--- a/thrust/iterator/iterator_facade.h
+++ b/thrust/iterator/iterator_facade.h
@@ -37,8 +37,7 @@
 #include <thrust/iterator/detail/iterator_facade_category.h>
 #include <thrust/iterator/detail/distance_from_result.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \{
@@ -539,5 +538,5 @@ Derived operator+ (typename Derived::difference_type n,
 /*! \} // end iterators
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/iterator_traits.h b/thrust/iterator/iterator_traits.h
index 5a33658c2..b2f4b175a 100644
--- a/thrust/iterator/iterator_traits.h
+++ b/thrust/iterator/iterator_traits.h
@@ -35,62 +35,13 @@
 
 #include <iterator>
 
-namespace thrust
-{
-
-namespace detail
-{
-
-template <typename T, typename = void>
-struct iterator_traits_impl {};
-
-template <typename T>
-struct iterator_traits_impl<
-  T
-, typename voider<
-    typename T::difference_type
-  , typename T::value_type
-  , typename T::pointer
-  , typename T::reference
-  , typename T::iterator_category
-  >::type 
->
-{
-  typedef typename T::difference_type difference_type;
-  typedef typename T::value_type value_type;
-  typedef typename T::pointer pointer;
-  typedef typename T::reference reference;
-  typedef typename T::iterator_category iterator_category;
-};
-
-} // namespace detail
+THRUST_NAMESPACE_BEGIN
 
 /*! \p iterator_traits is a type trait class that provides a uniform
  *  interface for querying the properties of iterators at compile-time.
  */
 template <typename T>
-struct iterator_traits : detail::iterator_traits_impl<T> {};
-
-// traits are specialized for pointer types
-template<typename T>
-  struct iterator_traits<T*>
-{
-  typedef std::ptrdiff_t difference_type;
-  typedef T value_type;
-  typedef T* pointer;
-  typedef T& reference;
-  typedef std::random_access_iterator_tag iterator_category;
-};
-
-template<typename T>
-  struct iterator_traits<const T*>
-{
-  typedef std::ptrdiff_t difference_type;
-  typedef T value_type;
-  typedef const T* pointer;
-  typedef const T& reference;
-  typedef std::random_access_iterator_tag iterator_category;
-}; // end iterator_traits
+struct iterator_traits : std::iterator_traits<T> {};
 
 template<typename Iterator> struct iterator_value;
 
@@ -104,7 +55,7 @@ template<typename Iterator> struct iterator_traversal;
 
 template<typename Iterator> struct iterator_system;
 
-} // namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/iterator/detail/iterator_traversal_tags.h>
 #include <thrust/iterator/detail/host_system_tag.h>
diff --git a/thrust/iterator/permutation_iterator.h b/thrust/iterator/permutation_iterator.h
index 73827040a..be5010e54 100644
--- a/thrust/iterator/permutation_iterator.h
+++ b/thrust/iterator/permutation_iterator.h
@@ -37,8 +37,7 @@
 #include <thrust/iterator/iterator_facade.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 /*! \addtogroup iterators
@@ -75,7 +74,7 @@ namespace thrust
  *  #include <thrust/iterator/permutation_iterator.h>
  *  #include <thrust/device_vector.h>
  *  ...
- *  thrust::device_vector<float> values(4);
+ *  thrust::device_vector<float> values(8);
  *  values[0] = 10.0f;
  *  values[1] = 20.0f;
  *  values[2] = 30.0f;
@@ -213,5 +212,5 @@ permutation_iterator<ElementIterator,IndexIterator> make_permutation_iterator(El
 /*! \} // end iterators
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/retag.h b/thrust/iterator/retag.h
index 6adf5e244..1eb770ae3 100644
--- a/thrust/iterator/retag.h
+++ b/thrust/iterator/retag.h
@@ -23,8 +23,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/iterator/detail/retag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 /*! \ingroup iterator_tags
@@ -66,5 +65,5 @@ unspecified_iterator_type retag(Iterator iter);
  */
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/reverse_iterator.h b/thrust/iterator/reverse_iterator.h
index 2ba97d0ac..fe8bbe0cf 100644
--- a/thrust/iterator/reverse_iterator.h
+++ b/thrust/iterator/reverse_iterator.h
@@ -37,8 +37,7 @@
 #include <thrust/iterator/detail/reverse_iterator_base.h>
 #include <thrust/iterator/iterator_facade.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \{
@@ -180,14 +179,14 @@ template<typename BidirectionalIterator>
     reverse_iterator(reverse_iterator<OtherBidirectionalIterator> const &r
 // XXX msvc screws this up
 // XXX remove these guards when we have static_assert
-#ifndef _MSC_VER
+#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
                      , typename thrust::detail::enable_if<
                          thrust::detail::is_convertible<
                            OtherBidirectionalIterator,
                            BidirectionalIterator
                          >::value
                        >::type * = 0
-#endif // _MSC_VER
+#endif // MSVC
                      );
 
   /*! \cond
@@ -232,7 +231,7 @@ reverse_iterator<BidirectionalIterator> make_reverse_iterator(BidirectionalItera
 /*! \} // end iterators
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/iterator/detail/reverse_iterator.inl>
 
diff --git a/thrust/iterator/transform_input_output_iterator.h b/thrust/iterator/transform_input_output_iterator.h
new file mode 100644
index 000000000..a5f725dc5
--- /dev/null
+++ b/thrust/iterator/transform_input_output_iterator.h
@@ -0,0 +1,164 @@
+/*
+ *  Copyright 2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/iterator/transform_input_output_iterator.h
+ *  \brief An iterator which adapts another iterator by applying transform
+ *         functions when reading and writing dereferenced values.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/detail/transform_input_output_iterator.inl>
+
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup iterators
+ *  \{
+ */
+
+/*! \addtogroup fancyiterator Fancy Iterators
+ *  \ingroup iterators
+ *  \{
+ */
+
+/*! \p transform_input_output_iterator is a special kind of iterator which applies
+ * transform functions when reading from or writing to dereferenced values.
+ * This iterator is useful for algorithms that operate on a type that needs to
+ * be serialized/deserialized from values in another iterator, avoiding the
+ * need to materialize intermediate results in memory. This also enables the
+ * transform functions to be fused with the operations that read and write to
+ * the `transform_input_output_iterator`.
+ *
+ * The following code snippet demonstrates how to create a
+ * \p transform_input_output_iterator which performs different transformations when
+ * reading from and writing to the iterator.
+ *
+ * \code
+ * #include <thrust/iterator/transform_input_output_iterator.h>
+ * #include <thrust/device_vector.h>
+ *
+ *  int main()
+ *  {
+ *    const size_t size = 4;
+ *    thrust::device_vector<float> v(size);
+ *
+ *    // Write 1.0f, 2.0f, 3.0f, 4.0f to vector
+ *    thrust::sequence(v.begin(), v.end(), 1);
+ *
+ *    // Iterator that returns negated values and writes squared values
+ *    auto iter = thrust::make_transform_input_output_iterator(v.begin(),
+ *        thrust::negate<float>{}, thrust::square<float>{});
+ * 
+ *    // Iterator negates values when reading
+ *    std::cout << iter[0] << " ";  // -1.0f;
+ *    std::cout << iter[1] << " ";  // -2.0f;
+ *    std::cout << iter[2] << " ";  // -3.0f;
+ *    std::cout << iter[3] << "\n"; // -4.0f;
+ *
+ *    // Write 1.0f, 2.0f, 3.0f, 4.0f to iterator
+ *    thrust::sequence(iter, iter + size, 1);
+ *
+ *    // Values were squared before writing to vector
+ *    std::cout << v[0] << " ";  // 1.0f;
+ *    std::cout << v[1] << " ";  // 4.0f;
+ *    std::cout << v[2] << " ";  // 9.0f;
+ *    std::cout << v[3] << "\n"; // 16.0f;
+ *
+ *  }
+ * \endcode
+ *
+ * \see make_transform_input_output_iterator
+ */
+
+template <typename InputFunction, typename OutputFunction, typename Iterator>
+  class transform_input_output_iterator
+    : public detail::transform_input_output_iterator_base<InputFunction, OutputFunction, Iterator>::type
+{
+
+  /*! \cond
+   */
+
+  public:
+
+    typedef typename
+    detail::transform_input_output_iterator_base<InputFunction, OutputFunction, Iterator>::type
+    super_t;
+
+    friend class thrust::iterator_core_access;
+  /*! \endcond
+   */
+
+  transform_input_output_iterator() = default;
+
+  /*! This constructor takes as argument a \c Iterator an \c InputFunction and an
+   * \c OutputFunction and copies them to a new \p transform_input_output_iterator
+   *
+   * \param io An \c Iterator pointing to where the input to \c InputFunction
+   *           will be read from and the result of \c OutputFunction will be written to
+   * \param input_function An \c InputFunction to be executed on values read from the iterator
+   * \param output_function An \c OutputFunction to be executed on values written to the iterator
+   */
+    __host__ __device__
+    transform_input_output_iterator(Iterator const& io, InputFunction input_function, OutputFunction output_function)
+      : super_t(io), input_function(input_function), output_function(output_function)
+    {
+    }
+
+    /*! \cond
+     */
+  private:
+
+    __host__ __device__
+    typename super_t::reference dereference() const
+    {
+      return detail::transform_input_output_iterator_proxy<
+        InputFunction, OutputFunction, Iterator
+      >(this->base_reference(), input_function, output_function);
+    }
+
+    InputFunction input_function;
+    OutputFunction output_function;
+
+    /*! \endcond
+     */
+}; // end transform_input_output_iterator
+
+/*! \p make_transform_input_output_iterator creates a \p transform_input_output_iterator from
+ *  an \c Iterator a \c InputFunction and a \c OutputFunction
+ *
+ * \param io An \c Iterator pointing to where the input to \c InputFunction
+ *           will be read from and the result of \c OutputFunction will be written to
+ * \param input_function An \c InputFunction to be executed on values read from the iterator
+ * \param output_function An \c OutputFunction to be executed on values written to the iterator
+ *  \see transform_input_output_iterator
+ */
+template <typename InputFunction, typename OutputFunction, typename Iterator>
+transform_input_output_iterator<InputFunction, OutputFunction, Iterator>
+__host__ __device__
+make_transform_input_output_iterator(Iterator io, InputFunction input_function, OutputFunction output_function)
+{
+    return transform_input_output_iterator<InputFunction, OutputFunction, Iterator>(io, input_function, output_function);
+} // end make_transform_input_output_iterator
+
+/*! \} // end fancyiterators
+ */
+
+/*! \} // end iterators
+ */
+
+THRUST_NAMESPACE_END
+
diff --git a/thrust/iterator/transform_iterator.h b/thrust/iterator/transform_iterator.h
index 2102d9857..5afb5f37b 100644
--- a/thrust/iterator/transform_iterator.h
+++ b/thrust/iterator/transform_iterator.h
@@ -16,14 +16,14 @@
 
 
 /*! \file thrust/iterator/transform_iterator.h
- *  \brief An iterator which adapts another iterator by applying a function to the result of its dereference 
+ *  \brief An iterator which adapts another iterator by applying a function to the result of its dereference
  */
 
 /*
  * (C) Copyright David Abrahams 2002.
  * (C) Copyright Jeremy Siek    2002.
  * (C) Copyright Thomas Witt    2002.
- * 
+ *
  * Distributed under the Boost Software License, Version 1.0.
  * (See accompanying NOTICE file for the complete license)
  *
@@ -40,8 +40,7 @@
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \{
@@ -53,7 +52,7 @@ namespace thrust
  */
 
 /*! \p transform_iterator is an iterator which represents a pointer into a range
- *  of values after transformation by a function. This iterator is useful for 
+ *  of values after transformation by a function. This iterator is useful for
  *  creating a range filled with the result of applying an operation to another range
  *  without either explicitly storing it in memory, or explicitly executing the transformation.
  *  Using \p transform_iterator facilitates kernel fusion by deferring the execution
@@ -66,7 +65,7 @@ namespace thrust
  *  \code
  *  #include <thrust/iterator/transform_iterator.h>
  *  #include <thrust/device_vector.h>
- *  
+ *
  *  // note: functor inherits from unary_function
  *  struct square_root : public thrust::unary_function<float,float>
  *  {
@@ -76,7 +75,7 @@ namespace thrust
  *      return sqrtf(x);
  *    }
  *  };
- *  
+ *
  *  int main()
  *  {
  *    thrust::device_vector<float> v(4);
@@ -84,17 +83,17 @@ namespace thrust
  *    v[1] = 4.0f;
  *    v[2] = 9.0f;
  *    v[3] = 16.0f;
- *                                                                                           
+ *
  *    typedef thrust::device_vector<float>::iterator FloatIterator;
- *                                                                                           
+ *
  *    thrust::transform_iterator<square_root, FloatIterator> iter(v.begin(), square_root());
- *                                                                                           
+ *
  *    *iter;   // returns 1.0f
  *    iter[0]; // returns 1.0f;
  *    iter[1]; // returns 2.0f;
  *    iter[2]; // returns 3.0f;
  *    iter[3]; // returns 4.0f;
- *                                                                                           
+ *
  *    // iter[4] is an out-of-bounds error
  *  }
  *  \endcode
@@ -109,7 +108,7 @@ namespace thrust
  *  #include <thrust/device_vector.h>
  *  #include <thrust/reduce.h>
  *  #include <iostream>
- *  
+ *
  *  // note: functor inherits from unary_function
  *  struct square : public thrust::unary_function<float,float>
  *  {
@@ -119,7 +118,7 @@ namespace thrust
  *      return x * x;
  *    }
  *  };
- *  
+ *
  *  int main()
  *  {
  *    // initialize a device array
@@ -128,29 +127,29 @@ namespace thrust
  *    v[1] = 2.0f;
  *    v[2] = 3.0f;
  *    v[3] = 4.0f;
- *  
+ *
  *    float sum_of_squares =
  *     thrust::reduce(thrust::make_transform_iterator(v.begin(), square()),
  *                    thrust::make_transform_iterator(v.end(),   square()));
- *  
+ *
  *    std::cout << "sum of squares: " << sum_of_squares << std::endl;
  *    return 0;
  *  }
  *  \endcode
  *
- *  Note that in the previous two examples the transform functor (namely \c square_root 
- *  and \c square) inherits from \c thrust::unary_function.  Inheriting from 
+ *  Note that in the previous two examples the transform functor (namely \c square_root
+ *  and \c square) inherits from \c thrust::unary_function.  Inheriting from
  *  \c thrust::unary_function ensures that a functor is a valid \c AdaptableUnaryFunction
  *  and provides all the necessary \c typedef declarations.  The \p transform_iterator
- *  can also be applied to a \c UnaryFunction that does not inherit from 
+ *  can also be applied to a \c UnaryFunction that does not inherit from
  *  \c thrust::unary_function using an optional template argument.  The following example
  *  illustrates how to use the third template argument to specify the \c result_type of
- *  the function.   
+ *  the function.
  *
  *  \code
  *  #include <thrust/iterator/transform_iterator.h>
  *  #include <thrust/device_vector.h>
- *  
+ *
  *  // note: functor *does not* inherit from unary_function
  *  struct square_root
  *  {
@@ -160,7 +159,7 @@ namespace thrust
  *      return sqrtf(x);
  *    }
  *  };
- *  
+ *
  *  int main()
  *  {
  *    thrust::device_vector<float> v(4);
@@ -168,18 +167,18 @@ namespace thrust
  *    v[1] = 4.0f;
  *    v[2] = 9.0f;
  *    v[3] = 16.0f;
- *                                                                                           
+ *
  *    typedef thrust::device_vector<float>::iterator FloatIterator;
- *    
+ *
  *    // note: float result_type is specified explicitly
  *    thrust::transform_iterator<square_root, FloatIterator, float> iter(v.begin(), square_root());
- *                                                                                           
+ *
  *    *iter;   // returns 1.0f
  *    iter[0]; // returns 1.0f;
  *    iter[1]; // returns 2.0f;
  *    iter[2]; // returns 3.0f;
  *    iter[3]; // returns 4.0f;
- *                                                                                           
+ *
  *    // iter[4] is an out-of-bounds error
  *  }
  *  \endcode
@@ -206,7 +205,11 @@ template <class AdaptableUnaryFunction, class Iterator, class Reference = use_de
      */
     __host__ __device__
     transform_iterator() {}
-  
+
+#if THRUST_CPP_DIALECT >= 2011
+    transform_iterator(transform_iterator const&) = default;
+#endif
+
     /*! This constructor takes as arguments an \c Iterator and an \c AdaptableUnaryFunction
      *  and copies them to a new \p transform_iterator.
      *
@@ -217,7 +220,7 @@ template <class AdaptableUnaryFunction, class Iterator, class Reference = use_de
     transform_iterator(Iterator const& x, AdaptableUnaryFunction f)
       : super_t(x), m_f(f) {
     }
-  
+
     /*! This explicit constructor copies the value of a given \c Iterator and creates
      *  this \p transform_iterator's \c AdaptableUnaryFunction using its null constructor.
      *
@@ -304,11 +307,11 @@ template <class AdaptableUnaryFunction, class Iterator, class Reference = use_de
     __thrust_exec_check_disable__
     __host__ __device__
     typename super_t::reference dereference() const
-    {  
+    {
       // Create a temporary to allow iterators with wrapped references to
       // convert to their value type before calling m_f. Note that this
-      // disallows non-constant operations through m_f. 
-      typename thrust::iterator_value<Iterator>::type x = *this->base();
+      // disallows non-constant operations through m_f.
+      typename thrust::iterator_value<Iterator>::type const& x = *this->base();
       return m_f(x);
     }
 
@@ -348,5 +351,5 @@ make_transform_iterator(Iterator it, AdaptableUnaryFunction fun)
 /*! \} // end iterators
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/transform_output_iterator.h b/thrust/iterator/transform_output_iterator.h
index 81fbcbbbd..3ac4b8572 100644
--- a/thrust/iterator/transform_output_iterator.h
+++ b/thrust/iterator/transform_output_iterator.h
@@ -1,7 +1,7 @@
 /*
  *  Copyright 2008-2018 NVIDIA Corporation
  *
- *  Licensed under the Apache License, Vesion 2.0 (the "License");
+ *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
  *  You may obtain a copy of the License at
  *
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/iterator/detail/transform_output_iterator.inl>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \{
@@ -105,6 +104,8 @@ template <typename UnaryFunction, typename OutputIterator>
   /*! \endcond
    */
 
+  transform_output_iterator() = default;
+
   /*! This constructor takes as argument an \c OutputIterator and an \c
    * UnaryFunction and copies them to a new \p transform_output_iterator
    *
@@ -159,5 +160,5 @@ make_transform_output_iterator(OutputIterator out, UnaryFunction fun)
 /*! \} // end iterators
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/zip_iterator.h b/thrust/iterator/zip_iterator.h
index 7b86d06d5..c2dd5ddc4 100644
--- a/thrust/iterator/zip_iterator.h
+++ b/thrust/iterator/zip_iterator.h
@@ -36,8 +36,7 @@
 #include <thrust/iterator/iterator_facade.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \{
@@ -229,9 +228,23 @@ template <typename IteratorTuple>
  *
  *  \see zip_iterator
  */
-template<typename IteratorTuple>
+template<typename... Iterators>
 inline __host__ __device__
-zip_iterator<IteratorTuple> make_zip_iterator(IteratorTuple t);
+zip_iterator<thrust::tuple<Iterators...>> make_zip_iterator(thrust::tuple<Iterators...> t);
+
+
+/*! \p make_zip_iterator creates a \p zip_iterator from
+ *  iterators.
+ *
+ *  \param its The iterators to copy.
+ *  \return A newly created \p zip_iterator which zips the iterators.
+ *
+ *  \see zip_iterator
+ */
+template<typename... Iterators>
+inline __host__ __device__
+zip_iterator<thrust::tuple<Iterators...>> make_zip_iterator(Iterators... its);
+
 
 /*! \} // end fancyiterators
  */
@@ -239,7 +252,7 @@ zip_iterator<IteratorTuple> make_zip_iterator(IteratorTuple t);
 /*! \} // end iterators
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/iterator/detail/zip_iterator.inl>
 
diff --git a/thrust/limits.h b/thrust/limits.h
index 10434a3cf..52f38b1fc 100644
--- a/thrust/limits.h
+++ b/thrust/limits.h
@@ -7,12 +7,12 @@
 
 #include <limits>
 
+#include <thrust/detail/config.h>
 #include <thrust/detail/type_traits.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 template <typename T>
 struct numeric_limits : std::numeric_limits<T> {};
 
-THRUST_END_NS
-
+THRUST_NAMESPACE_END
diff --git a/thrust/logical.h b/thrust/logical.h
index ce2127219..5a8dbbecf 100644
--- a/thrust/logical.h
+++ b/thrust/logical.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup reductions
  *  \{
@@ -50,8 +48,8 @@ namespace thrust
  *  \return \c true, if all elements satisfy the predicate; \c false, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \code
  *  #include <thrust/logical.h>
@@ -87,8 +85,8 @@ bool all_of(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, In
  *  \param pred A predicate used to test range elements.
  *  \return \c true, if all elements satisfy the predicate; \c false, otherwise.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \code
  *  #include <thrust/logical.h>
@@ -126,8 +124,8 @@ bool all_of(InputIterator first, InputIterator last, Predicate pred);
  *  \return \c true, if any element satisfies the predicate; \c false, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \code
  *  #include <thrust/logical.h>
@@ -164,8 +162,8 @@ bool any_of(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, In
  *  \param pred A predicate used to test range elements.
  *  \return \c true, if any element satisfies the predicate; \c false, otherwise.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \code
  *  #include <thrust/logical.h>
@@ -204,8 +202,8 @@ bool any_of(InputIterator first, InputIterator last, Predicate pred);
  *  \return \c true, if no element satisfies the predicate; \c false, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \code
  *  #include <thrust/logical.h>
@@ -242,8 +240,8 @@ bool none_of(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, I
  *  \param pred A predicate used to test range elements.
  *  \return \c true, if no element satisfies the predicate; \c false, otherwise.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \code
  *  #include <thrust/logical.h>
@@ -272,8 +270,6 @@ bool none_of(InputIterator first, InputIterator last, Predicate pred);
  *  \} // end reductions
  */
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/logical.inl>
-
diff --git a/thrust/memory.h b/thrust/memory.h
index 7a074ee16..819ac2513 100644
--- a/thrust/memory.h
+++ b/thrust/memory.h
@@ -18,8 +18,9 @@
  *  \brief Abstractions for Thrust's memory model.
  */
 
-#include <thrust/detail/config.h>
+#pragma once
 
+#include <thrust/detail/config.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
 #include <thrust/detail/pointer.h>
 #include <thrust/detail/reference.h>
@@ -28,8 +29,7 @@
 #include <thrust/detail/malloc_and_free.h>
 #include <thrust/detail/temporary_buffer.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \defgroup memory_management Memory Management
  *
@@ -37,8 +37,7 @@ namespace thrust
  *
  */
 
-/** \addtogroup memory_management_classes Memory Management Classes
- *  \ingroup memory_management
+/** \addtogroup memory_management Memory Management
  *  \{
  */
 
@@ -49,7 +48,7 @@ namespace thrust
  *  type ensures type safety when dispatching standard algorithms on ranges resident in memory.
  *
  *  \p pointer generalizes \p device_ptr by relaxing the backend system associated with the \p pointer.
- *  Instead of the backend system specified by \p THRUST_DEFAULT_DEVICE_BACKEND, \p pointer's
+ *  Instead of the backend system specified by \p THRUST_DEVICE_SYSTEM, \p pointer's
  *  system is given by its second template parameter, \p Tag. For the purpose of Thrust dispatch,
  *  <tt>device_ptr<Element></tt> and <tt>pointer<Element,device_system_tag></tt> are considered equivalent.
  *
@@ -82,7 +81,7 @@ template<typename Element, typename Tag, typename Reference = thrust::use_defaul
     /*! The type of the raw pointer
      */
     typedef typename super_t::base_type raw_pointer;
-    
+
     /*! \p pointer's default constructor initializes its encapsulated pointer to \c 0
      */
     __host__ __device__
@@ -112,7 +111,8 @@ template<typename Element, typename Tag, typename Reference = thrust::use_defaul
               pointer<Element,Tag,Reference,Derived>
             >::type * = 0);
 
-    /*! Assignment operator allows assigning from another pointer-like object with related type.
+    /*! Assignment operator allows assigning from another pointer-like object whose element type
+     *  is convertible to \c Element.
      *
      *  \param other The other pointer-like object to assign from.
      *  \return <tt>*this</tt>
@@ -137,141 +137,6 @@ template<typename Element, typename Tag, typename Reference = thrust::use_defaul
 };
 #endif
 
-// define pointer for the purpose of Doxygenating it
-// it is actually defined elsewhere
-#if 0
-/*! \p reference is a wrapped reference to an object stored in memory. \p reference generalizes
- *  \p device_reference by relaxing the type of pointer associated with the object. \p reference
- *  is the type of the result of dereferencing a tagged pointer-like object such as \p pointer, and
- *  intermediates operations on objects existing in a remote memory.
- *
- *  \tparam Element specifies the type of the referent object.
- *  \tparam Pointer specifies the type of the result of taking the address of \p reference.
- *  \tparam Derived allows the client to specify the name of the derived type when \p reference is used as
- *          a base class. This is useful to ensure that assignment to objects of the derived type return
- *          values of the derived type as a result. By default, this type is <tt>reference<Element,Pointer></tt>.
- */
-template<typename Element, typename Pointer, typename Derived = thrust::use_default>
-  class reference
-{
-  public:
-    /*! The type of this \p reference's wrapped pointers.
-     */
-    typedef Pointer                                              pointer;
-
-    /*! The \p value_type of this \p reference.
-     */
-    typedef typename thrust::detail::remove_const<Element>::type value_type;
-
-    /*! This copy constructor initializes this \p reference
-     *  to refer to an object pointed to by the given \p pointer. After
-     *  this \p reference is constructed, it shall refer to the
-     *  object pointed to by \p ptr.
-     *
-     *  \param ptr A \p pointer to copy from.
-     */
-    __host__ __device__
-    explicit reference(const pointer &ptr);
-
-    /*! This copy constructor accepts a const reference to another
-     *  \p reference of related type. After this \p reference is constructed,
-     *  it shall refer to the same object as \p other.
-     *  
-     *  \param other A \p reference to copy from.
-     *  \tparam OtherElement the element type of the other \p reference.
-     *  \tparam OtherPointer the pointer type of the other \p reference.
-     *  \tparam OtherDerived the derived type of the other \p reference.
-     *
-     *  \note This constructor is templated primarily to allow initialization of 
-     *  <tt>reference<const T,...></tt> from <tt>reference<T,...></tt>.
-     */
-    template<typename OtherElement, typename OtherPointer, typename OtherDerived>
-    __host__ __device__
-    reference(const reference<OtherElement,OtherPointer,OtherDerived> &other,
-              typename thrust::detail::enable_if_convertible<
-                typename reference<OtherElement,OtherPointer,OtherDerived>::pointer,
-                pointer
-              >::type * = 0);
-
-    /*! Copy assignment operator copy assigns from another \p reference.
-     *
-     *  \param other The other \p reference to assign from.
-     *  \return <tt>static_cast<derived_type&>(*this)</tt>
-     */
-    __host__ __device__
-    derived_type &operator=(const reference &other);
-
-    /*! Assignment operator copy assigns from another \p reference of related type.
-     *
-     *  \param other The other \p reference to assign from.
-     *  \return <tt>static_cast<derived_type&>(*this)</tt>
-     *
-     *  \tparam OtherElement the element type of the other \p reference.
-     *  \tparam OtherPointer the pointer type of the other \p reference.
-     *  \tparam OtherDerived the derived type of the other \p reference.
-     */
-    template<typename OtherElement, typename OtherPointer, typename OtherDerived>
-    __host__ __device__
-    derived_type &operator=(const reference<OtherElement,OtherPointer,OtherDerived> &other);
-
-    /*! Assignment operator assigns from a \p value_type.
-     *
-     *  \param x The \p value_type to assign from.
-     *  \return <tt>static_cast<derived_type&>(*this)</tt>.
-     */
-    __host__ __device__
-    derived_type &operator=(const value_type &x);
-
-    /*! Address-of operator returns a \p pointer pointing to the object
-     *  referenced by this \p reference. It does not return the address of this
-     *  \p reference.
-     *
-     *  \return A \p pointer pointing to the referenct object.
-     */
-    __host__ __device__
-    pointer operator&() const;
-
-    /*! Conversion operator converts this \p reference to \p value_type by
-     *  returning a copy of the referent object.
-     *  
-     *  \return A copy of the referent object.
-     */
-    __host__ __device__
-    operator value_type () const;
-
-    /*! Swaps the value of the referent object with another.
-     *
-     *  \param other The other \p reference with which to swap.
-     *  \note The argument is of type \p derived_type rather than \p reference.
-     */
-    __host__ __device__
-    void swap(derived_type &other);
-
-    /*! Prefix increment operator increments the referent object.
-     *
-     *  \return <tt>static_Cast<derived_type&>(*this)</tt>.
-     *
-     *  \note Documentation for other arithmetic operators omitted for brevity.
-     */
-    derived_type &operator++();
-};
-#endif
-
-/*! \}
- */
-
-/*!
- *  \addtogroup memory_management_functions Memory Management Functions
- *  \ingroup memory_management
- *  \{
- */
-
-
-/*! \addtogroup allocation_functions
- *  \{
- */
-
-
 /*! This version of \p malloc allocates untyped uninitialized storage associated with a given system.
  *
  *  \param system The Thrust system with which to associate the storage.
@@ -281,7 +146,7 @@ template<typename Element, typename Pointer, typename Derived = thrust::use_defa
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
  *
- *  \pre \p DerivedPolicy must be publically derived from <code>thrust::execution_policy<DerivedPolicy></code>.
+ *  \pre \p DerivedPolicy must be publically derived from <tt>thrust::execution_policy<DerivedPolicy></tt>.
  *
  *  The following code snippet demonstrates how to use \p malloc to allocate a range of memory
  *  associated with Thrust's device system.
@@ -319,7 +184,7 @@ pointer<void,DerivedPolicy> malloc(const thrust::detail::execution_policy_base<D
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
  *
- *  \pre \p DerivedPolicy must be publically derived from <code>thrust::execution_policy<DerivedPolicy></code>.
+ *  \pre \p DerivedPolicy must be publically derived from <tt>thrust::execution_policy<DerivedPolicy></tt>.
  *
  *  The following code snippet demonstrates how to use \p malloc to allocate a range of memory
  *  to accomodate integers associated with Thrust's device system.
@@ -363,7 +228,7 @@ pointer<T,DerivedPolicy> malloc(const thrust::detail::execution_policy_base<Deri
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
  *
- *  \pre \p DerivedPolicy must be publically derived from <code>thrust::execution_policy<DerivedPolicy></code>.
+ *  \pre \p DerivedPolicy must be publically derived from <tt>thrust::execution_policy<DerivedPolicy></tt>.
  *
  *  The following code snippet demonstrates how to use \p get_temporary_buffer to allocate a range of memory
  *  to accomodate integers associated with Thrust's device system.
@@ -400,16 +265,6 @@ __host__ __device__
 thrust::pair<thrust::pointer<T,DerivedPolicy>, typename thrust::pointer<T,DerivedPolicy>::difference_type>
 get_temporary_buffer(const thrust::detail::execution_policy_base<DerivedPolicy> &system, typename thrust::pointer<T,DerivedPolicy>::difference_type n);
 
-
-/*! \} allocation_functions
- */
-
-
-/*! \addtogroup deallocation_functions
- *  \{
- */
-
-
 /*! \p free deallocates the storage previously allocated by \p thrust::malloc.
  *
  *  \param system The Thrust system with which the storage is associated.
@@ -486,11 +341,7 @@ void free(const thrust::detail::execution_policy_base<DerivedPolicy> &system, Po
  */
 template<typename DerivedPolicy, typename Pointer>
 __host__ __device__
-void return_temporary_buffer(const thrust::detail::execution_policy_base<DerivedPolicy> &system, Pointer p);
-
-
-/*! \} deallocation_functions
- */
+void return_temporary_buffer(const thrust::detail::execution_policy_base<DerivedPolicy> &system, Pointer p, std::ptrdiff_t n);
 
 
 /*! \p raw_pointer_cast creates a "raw" pointer from a pointer-like type,
@@ -539,9 +390,7 @@ __host__ __device__
 typename detail::raw_reference<const T>::type
   raw_reference_cast(const T &ref);
 
-
-/*! \}
+/*! \} // memory_management
  */
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/merge.h b/thrust/merge.h
index 184141f6f..724f4c167 100644
--- a/thrust/merge.h
+++ b/thrust/merge.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup merging Merging
  *  \ingroup algorithms
@@ -55,17 +53,17 @@ namespace thrust
  *  \return The end of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting range shall not overlap with either input range.
@@ -90,7 +88,7 @@ namespace thrust
  *  // result = {1, 1, 1, 2, 3, 3, 5, 5, 7, 8, 9, 11, 13}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/merge.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/merge
  *  \see \p set_union
  *  \see \p sort
  *  \see \p is_sorted
@@ -125,17 +123,17 @@ __host__ __device__
  *  \param result The beginning of the merged output.
  *  \return The end of the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting range shall not overlap with either input range.
@@ -155,7 +153,7 @@ __host__ __device__
  *  // result = {1, 1, 1, 2, 3, 3, 5, 5, 7, 8, 9, 11, 13}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/merge.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/merge
  *  \see \p set_union
  *  \see \p sort
  *  \see \p is_sorted
@@ -192,14 +190,14 @@ template<typename InputIterator1,
  *  \return The end of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting range shall not overlap with either input range.
@@ -226,7 +224,7 @@ template<typename InputIterator1,
  *  // result = {13, 11, 9, 8, 7, 5, 5, 3, 3, 2, 1, 1, 1}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/merge.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/merge
  *  \see \p sort
  *  \see \p is_sorted
  */
@@ -263,14 +261,14 @@ __host__ __device__
  *  \param comp Comparison operator.
  *  \return The end of the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting range shall not overlap with either input range.
@@ -292,7 +290,7 @@ __host__ __device__
  *  // result = {13, 11, 9, 8, 7, 5, 5, 3, 3, 2, 1, 1, 1}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/merge.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/merge
  *  \see \p sort
  *  \see \p is_sorted
  */
@@ -340,22 +338,22 @@ template<typename InputIterator1,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -432,22 +430,22 @@ __host__ __device__
  *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -523,19 +521,19 @@ template<typename InputIterator1, typename InputIterator2, typename InputIterato
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator1's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator1's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -617,19 +615,19 @@ __host__ __device__
  *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator1's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator1's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -674,7 +672,6 @@ template<typename InputIterator1, typename InputIterator2, typename InputIterato
 /*! \} // merging
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/merge.inl>
-
diff --git a/thrust/mismatch.h b/thrust/mismatch.h
index 413db84f5..bbdf2923a 100644
--- a/thrust/mismatch.h
+++ b/thrust/mismatch.h
@@ -25,9 +25,7 @@
 #include <thrust/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup algorithms
  */
@@ -57,9 +55,9 @@ namespace thrust
  *  \return The first position where the sequences differ.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *          and \p InputIterator1's \c value_type is equality comparable to \p InputIterator2's \c value_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *
  *  \code
  *  #include <thrust/mismatch.h>
@@ -109,9 +107,9 @@ thrust::pair<InputIterator1, InputIterator2> mismatch(const thrust::detail::exec
  *  \param first2 The beginning of the second sequence.
  *  \return The first position where the sequences differ.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *          and \p InputIterator1's \c value_type is equality comparable to \p InputIterator2's \c value_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *
  *  \code
  *  #include <thrust/mismatch.h>
@@ -163,9 +161,9 @@ thrust::pair<InputIterator1, InputIterator2> mismatch(InputIterator1 first1,
  *  \return The first position where the sequences differ.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Input Iterator</a>.
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Input Iterator</a>.
  *
  *  \code
  *  #include <thrust/mismatch.h>
@@ -217,9 +215,9 @@ thrust::pair<InputIterator1, InputIterator2> mismatch(const thrust::detail::exec
  *  \param pred   The binary predicate to compare elements.
  *  \return The first position where the sequences differ.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Input Iterator</a>.
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Input Iterator</a>.
  *
  *  \code
  *  #include <thrust/mismatch.h>
@@ -254,7 +252,6 @@ thrust::pair<InputIterator1, InputIterator2> mismatch(InputIterator1 first1,
 /*! \} // end searching
  */
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/mismatch.inl>
-
diff --git a/thrust/mr/allocator.h b/thrust/mr/allocator.h
index b012fe85b..67adbe87c 100644
--- a/thrust/mr/allocator.h
+++ b/thrust/mr/allocator.h
@@ -14,27 +14,27 @@
  *  limitations under the License.
  */
 
-/*! \file allocator.h
- *  \brief Allocator types usable with NPA-based memory resources.
+/*! \file 
+ *  \brief Allocator types usable with \ref Memory Resources.
  */
 
 #pragma once
 
 #include <limits>
 
+#include <thrust/detail/config.h>
+#include <thrust/detail/config/exec_check_disable.h>
+#include <thrust/detail/config/memory_resource.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
 
-#include <thrust/mr/detail/config.h>
 #include <thrust/mr/validator.h>
 #include <thrust/mr/polymorphic_adaptor.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace mr
 {
 
-/*! \addtogroup memory_management Memory Management
- *  \addtogroup memory_management_classes Memory Management Classes
+/*! \addtogroup allocators Allocators
  *  \ingroup memory_management
  *  \{
  */
@@ -59,7 +59,7 @@ class allocator : private validator<MR>
     typedef T value_type;
     /*! The pointer type allocated by this allocator. Equivaled to the pointer type of \p MR rebound to \p T. */
     typedef typename thrust::detail::pointer_traits<void_pointer>::template rebind<T>::other pointer;
-    /*! The pointer to const type. Equivalent to a pointer type of \p MR reboud to <tt>const T</tt>. */
+    /*! The pointer to const type. Equivalent to a pointer type of \p MR rebound to <tt>const T</tt>. */
     typedef typename thrust::detail::pointer_traits<void_pointer>::template rebind<const T>::other const_pointer;
     /*! The reference to the type allocated by this allocator. Supports smart references. */
     typedef typename thrust::detail::pointer_traits<pointer>::reference reference;
@@ -91,12 +91,13 @@ class allocator : private validator<MR>
 
     /*! Calculates the maximum number of elements allocated by this allocator.
      *
-     *  \returns the maximum value of \p std::size_t, divided by the size of \p T.
+     *  \return the maximum value of \p std::size_t, divided by the size of \p T.
      */
+    __thrust_exec_check_disable__
     __host__ __device__
     size_type max_size() const
     {
-        return std::numeric_limits<size_type>::max() / sizeof(T);
+        return (std::numeric_limits<size_type>::max)() / sizeof(T);
     }
 
     /*! Constructor.
@@ -118,7 +119,7 @@ class allocator : private validator<MR>
     /*! Allocates objects of type \p T.
      *
      *  \param n number of elements to allocate
-     *  \returns a pointer to the newly allocated storage.
+     *  \return a pointer to the newly allocated storage.
      */
     THRUST_NODISCARD
     __host__
@@ -140,7 +141,7 @@ class allocator : private validator<MR>
 
     /*! Extracts the memory resource used by this allocator.
      *
-     *  \returns the memory resource used by this allocator.
+     *  \return the memory resource used by this allocator.
      */
     __host__ __device__
     MR * resource() const
@@ -155,7 +156,7 @@ class allocator : private validator<MR>
 /*! Compares the allocators for equality by comparing the underlying memory resources. */
 template<typename T, typename MR>
 __host__ __device__
-bool operator==(const allocator<T, MR> & lhs, const allocator<T, MR> & rhs) THRUST_NOEXCEPT
+bool operator==(const allocator<T, MR> & lhs, const allocator<T, MR> & rhs) noexcept
 {
     return *lhs.resource() == *rhs.resource();
 }
@@ -163,17 +164,17 @@ bool operator==(const allocator<T, MR> & lhs, const allocator<T, MR> & rhs) THRU
 /*! Compares the allocators for inequality by comparing the underlying memory resources. */
 template<typename T, typename MR>
 __host__ __device__
-bool operator!=(const allocator<T, MR> & lhs, const allocator<T, MR> & rhs) THRUST_NOEXCEPT
+bool operator!=(const allocator<T, MR> & lhs, const allocator<T, MR> & rhs) noexcept
 {
     return !(lhs == rhs);
 }
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
 
 template<typename T, typename Pointer>
 using polymorphic_allocator = allocator<T, polymorphic_adaptor_resource<Pointer> >;
 
-#else
+#else // C++11
 
 template<typename T, typename Pointer>
 class polymorphic_allocator : public allocator<T, polymorphic_adaptor_resource<Pointer> >
@@ -188,7 +189,7 @@ class polymorphic_allocator : public allocator<T, polymorphic_adaptor_resource<P
     }
 };
 
-#endif
+#endif // C++11
 
 /*! A helper allocator class that uses global instances of a given upstream memory resource. Requires the memory resource
  *      to be default constructible.
@@ -218,7 +219,8 @@ class stateless_resource_allocator : public thrust::mr::allocator<T, Upstream>
     /*! Default constructor. Uses \p get_global_resource to get the global instance of \p Upstream and initializes the
      *      \p allocator base subobject with that resource.
      */
-    __host__
+    __thrust_exec_check_disable__
+    __host__ __device__
     stateless_resource_allocator() : base(get_global_resource<Upstream>())
     {
     }
@@ -234,11 +236,18 @@ class stateless_resource_allocator : public thrust::mr::allocator<T, Upstream>
     stateless_resource_allocator(const stateless_resource_allocator<U, Upstream> & other)
         : base(other) {}
 
+#if THRUST_CPP_DIALECT >= 2011
+    stateless_resource_allocator & operator=(const stateless_resource_allocator &) = default;
+#endif
+
     /*! Destructor. */
     __host__ __device__
     ~stateless_resource_allocator() {}
 };
 
+/*! \} // allocators
+ */
+
 } // end mr
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/memory/detail/device_system_resource.h b/thrust/mr/device_memory_resource.h
similarity index 93%
rename from thrust/memory/detail/device_system_resource.h
rename to thrust/mr/device_memory_resource.h
index 9e94991d6..3a671142a 100644
--- a/thrust/memory/detail/device_system_resource.h
+++ b/thrust/mr/device_memory_resource.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2018 NVIDIA Corporation
+ *  Copyright 2018-2020 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -23,8 +23,7 @@
 #include __THRUST_DEVICE_SYSTEM_MEMORY_HEADER
 #undef __THRUST_DEVICE_SYSTEM_MEMORY_HEADER
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 typedef thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::memory_resource
@@ -35,5 +34,5 @@ typedef thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::universal_host_pinned_
     universal_host_pinned_memory_resource;
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/mr/disjoint_pool.h b/thrust/mr/disjoint_pool.h
index 9515e2fba..b00a8644c 100644
--- a/thrust/mr/disjoint_pool.h
+++ b/thrust/mr/disjoint_pool.h
@@ -14,14 +14,15 @@
  *  limitations under the License.
  */
 
-/*! \file disjoint_pool.h
+/*! \file 
  *  \brief A caching and pooling memory resource adaptor which uses separate upstream resources for memory allocation
  *      and bookkeeping.
  */
 
 #pragma once
 
-#include <algorithm>
+#include <thrust/detail/algorithm_wrapper.h>
+#include <thrust/detail/config.h>
 
 #include <thrust/host_vector.h>
 #include <thrust/binary_search.h>
@@ -33,13 +34,12 @@
 
 #include <cassert>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace mr
 {
 
 /** \addtogroup memory_resources Memory Resources
- *  \ingroup memory_management_classes
+ *  \ingroup memory_management
  *  \{
  */
 
@@ -68,7 +68,7 @@ namespace mr
  *  \tparam Bookkeeper the type of memory resources that will be used for allocating bookkeeping memory
  */
 template<typename Upstream, typename Bookkeeper>
-class disjoint_unsynchronized_pool_resource THRUST_FINAL
+class disjoint_unsynchronized_pool_resource final
     : public memory_resource<typename Upstream::pointer>,
         private validator2<Upstream, Bookkeeper>
 {
@@ -249,6 +249,10 @@ class disjoint_unsynchronized_pool_resource THRUST_FINAL
         {
         }
 
+#if THRUST_CPP_DIALECT >= 2011
+        pool & operator=(const pool &) = default;
+#endif
+
         __host__
         ~pool() {}
 
@@ -311,7 +315,7 @@ class disjoint_unsynchronized_pool_resource THRUST_FINAL
         m_cached_oversized.clear();
     }
 
-    THRUST_NODISCARD virtual void_ptr do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    THRUST_NODISCARD virtual void_ptr do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
     {
         bytes = (std::max)(bytes, m_options.smallest_block_size);
         assert(detail::is_power_of_2(alignment));
@@ -438,7 +442,7 @@ class disjoint_unsynchronized_pool_resource THRUST_FINAL
         return ret;
     }
 
-    virtual void do_deallocate(void_ptr p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    virtual void do_deallocate(void_ptr p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
     {
         n = (std::max)(n, m_options.smallest_block_size);
         assert(detail::is_power_of_2(alignment));
@@ -477,9 +481,9 @@ class disjoint_unsynchronized_pool_resource THRUST_FINAL
     }
 };
 
-/*! \}
+/*! \} // memory_resource
  */
 
 } // end mr
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/mr/disjoint_sync_pool.h b/thrust/mr/disjoint_sync_pool.h
index ed6cab7ed..ed81ae4cb 100644
--- a/thrust/mr/disjoint_sync_pool.h
+++ b/thrust/mr/disjoint_sync_pool.h
@@ -14,12 +14,13 @@
  *  limitations under the License.
  */
 
-/*! \file disjoint_sync_pool.h
+/*! \file 
  *  \brief A mutex-synchronized version of \p disjoint_unsynchronized_pool_resource.
  */
 
 #pragma once
 
+#include <thrust/detail/config.h>
 #include <thrust/detail/cpp11_required.h>
 
 #if THRUST_CPP_DIALECT >= 2011
@@ -28,15 +29,12 @@
 
 #include <thrust/mr/disjoint_pool.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace mr
 {
 
-/*! \addtogroup memory_management Memory Management
- *  \addtogroup memory_management_classes Memory Management Classes
- *  \addtogroup memory_resources Memory Resources
- *  \ingroup memory_resources
+/*! \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management
  *  \{
  */
 
@@ -92,13 +90,13 @@ struct disjoint_synchronized_pool_resource : public memory_resource<typename Ups
         upstream_pool.release();
     }
 
-    THRUST_NODISCARD virtual void_ptr do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    THRUST_NODISCARD virtual void_ptr do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
     {
         lock_t lock(mtx);
         return upstream_pool.do_allocate(bytes, alignment);
     }
 
-    virtual void do_deallocate(void_ptr p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    virtual void do_deallocate(void_ptr p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
     {
         lock_t lock(mtx);
         upstream_pool.do_deallocate(p, n, alignment);
@@ -109,11 +107,11 @@ struct disjoint_synchronized_pool_resource : public memory_resource<typename Ups
     unsync_pool upstream_pool;
 };
 
-/*! \}
+/*! \} // memory_resources
  */
 
 } // end mr
-} // end thrust
+THRUST_NAMESPACE_END
 
 #endif // THRUST_CPP_DIALECT >= 2011
 
diff --git a/thrust/mr/disjoint_tls_pool.h b/thrust/mr/disjoint_tls_pool.h
index 37c7e0993..9fc7917ca 100644
--- a/thrust/mr/disjoint_tls_pool.h
+++ b/thrust/mr/disjoint_tls_pool.h
@@ -20,14 +20,14 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
 #include <thrust/detail/cpp11_required.h>
 
 #if THRUST_CPP_DIALECT >= 2011
 
 #include <thrust/mr/disjoint_pool.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace mr
 {
 
@@ -46,7 +46,7 @@ namespace mr
  *  \param bookkeeper the second argument to the constructor, if invoked
  */
 template<typename Upstream, typename Bookkeeper>
-__host__ __device__
+__host__
 thrust::mr::disjoint_unsynchronized_pool_resource<Upstream, Bookkeeper> & tls_disjoint_pool(
     Upstream * upstream = NULL,
     Bookkeeper * bookkeeper = NULL)
@@ -63,7 +63,7 @@ thrust::mr::disjoint_unsynchronized_pool_resource<Upstream, Bookkeeper> & tls_di
  */
 
 } // end mr
-} // end thrust
+THRUST_NAMESPACE_END
 
 #endif // THRUST_CPP_DIALECT >= 2011
 
diff --git a/thrust/mr/fancy_pointer_resource.h b/thrust/mr/fancy_pointer_resource.h
index 53ffc7eb7..b88107564 100644
--- a/thrust/mr/fancy_pointer_resource.h
+++ b/thrust/mr/fancy_pointer_resource.h
@@ -16,18 +16,18 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
 
 #include <thrust/mr/memory_resource.h>
 #include <thrust/mr/validator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace mr
 {
 
 template<typename Upstream, typename Pointer>
-class fancy_pointer_resource THRUST_FINAL : public memory_resource<Pointer>, private validator<Upstream>
+class fancy_pointer_resource final : public memory_resource<Pointer>, private validator<Upstream>
 {
 public:
     fancy_pointer_resource() : m_upstream(get_global_resource<Upstream>())
@@ -39,12 +39,12 @@ class fancy_pointer_resource THRUST_FINAL : public memory_resource<Pointer>, pri
     }
 
     THRUST_NODISCARD
-    virtual Pointer do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    virtual Pointer do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
     {
         return static_cast<Pointer>(m_upstream->do_allocate(bytes, alignment));
     }
 
-    virtual void do_deallocate(Pointer p, std::size_t bytes, std::size_t alignment) THRUST_OVERRIDE
+    virtual void do_deallocate(Pointer p, std::size_t bytes, std::size_t alignment) override
     {
         return m_upstream->do_deallocate(
             static_cast<typename Upstream::pointer>(
@@ -57,5 +57,5 @@ class fancy_pointer_resource THRUST_FINAL : public memory_resource<Pointer>, pri
 };
 
 } // end mr
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/memory/detail/host_system_resource.h b/thrust/mr/host_memory_resource.h
similarity index 91%
rename from thrust/memory/detail/host_system_resource.h
rename to thrust/mr/host_memory_resource.h
index ded1c4d0b..9359a97a7 100644
--- a/thrust/memory/detail/host_system_resource.h
+++ b/thrust/mr/host_memory_resource.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2018 NVIDIA Corporation
+ *  Copyright 2018-2020 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -23,11 +23,10 @@
 #include __THRUST_HOST_SYSTEM_MEMORY_HEADER
 #undef __THRUST_HOST_SYSTEM_MEMORY_HEADER
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 typedef thrust::system::__THRUST_HOST_SYSTEM_NAMESPACE::memory_resource
     host_memory_resource;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/mr/memory_resource.h b/thrust/mr/memory_resource.h
index 048ca2405..6af2f167c 100644
--- a/thrust/mr/memory_resource.h
+++ b/thrust/mr/memory_resource.h
@@ -14,27 +14,27 @@
  *  limitations under the License.
  */
 
-/*! \file mr/memory_resource.h
- *  \brief A base class for the memory resource system, similar to std::memory_resource,
- *      and related utilities.
+/*! \file
+ *  \brief A base class for the memory resource system, similar to
+ *  std::memory_resource, and related utilities.
  */
 
 #pragma once
 
-#include "detail/config.h"
+#include <thrust/detail/config.h>
+#include <thrust/detail/config/memory_resource.h>
 #ifdef THRUST_MR_STD_MR_HEADER
 #  include THRUST_MR_STD_MR_HEADER
 #endif
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 /*! \brief \p thrust::mr is the namespace containing system agnostic types and functions for \p memory_resource related functionalities.
  */
 namespace mr
 {
 
 /** \addtogroup memory_resources Memory Resources
- *  \ingroup memory_management_classes
+ *  \ingroup memory_management
  *  \{
  */
 
@@ -54,14 +54,14 @@ class memory_resource
 
     /*! Virtual destructor, defaulted when possible.
      */
-    virtual ~memory_resource() THRUST_DEFAULT
+    virtual ~memory_resource() = default;
 
     /*! Allocates memory of size at least \p bytes and alignment at least \p alignment.
      *
      *  \param bytes size, in bytes, that is requested from this allocation
      *  \param alignment alignment that is requested from this allocation
      *  \throws thrust::bad_alloc when no memory with requested size and alignment can be allocated.
-     *  \returns A pointer to void to the newly allocated memory.
+     *  \return A pointer to void to the newly allocated memory.
      */
     THRUST_NODISCARD
     pointer allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT)
@@ -86,10 +86,10 @@ class memory_resource
      *      which is often the right thing to do and doesn't require RTTI involvement.
      *
      *  \param other the other resource to compare this resource to
-     *  \returns whether the two resources are equivalent.
+     *  \return whether the two resources are equivalent.
      */
     __host__ __device__
-    bool is_equal(const memory_resource & other) const THRUST_NOEXCEPT
+    bool is_equal(const memory_resource & other) const noexcept
     {
         return do_is_equal(other);
     }
@@ -99,7 +99,7 @@ class memory_resource
      *  \param bytes size, in bytes, that is requested from this allocation
      *  \param alignment alignment that is requested from this allocation
      *  \throws thrust::bad_alloc when no memory with requested size and alignment can be allocated.
-     *  \returns A pointer to void to the newly allocated memory.
+     *  \return A pointer to void to the newly allocated memory.
      */
     virtual pointer do_allocate(std::size_t bytes, std::size_t alignment) = 0;
 
@@ -117,10 +117,10 @@ class memory_resource
      *      which is often the right thing to do and doesn't require RTTI involvement.
      *
      *  \param other the other resource to compare this resource to
-     *  \returns whether the two resources are equivalent.
+     *  \return whether the two resources are equivalent.
      */
     __host__ __device__
-    virtual bool do_is_equal(const memory_resource & other) const THRUST_NOEXCEPT
+    virtual bool do_is_equal(const memory_resource & other) const noexcept
     {
         return this == &other;
     }
@@ -135,7 +135,7 @@ class memory_resource<void *>
 public:
     typedef void * pointer;
 
-    virtual ~memory_resource() THRUST_DEFAULT
+    virtual ~memory_resource() = default;
 
     THRUST_NODISCARD
     pointer allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT)
@@ -149,7 +149,7 @@ class memory_resource<void *>
     }
 
     __host__ __device__
-    bool is_equal(const memory_resource & other) const THRUST_NOEXCEPT
+    bool is_equal(const memory_resource & other) const noexcept
     {
         return do_is_equal(other);
     }
@@ -157,7 +157,7 @@ class memory_resource<void *>
     virtual pointer do_allocate(std::size_t bytes, std::size_t alignment) = 0;
     virtual void do_deallocate(pointer p, std::size_t bytes, std::size_t alignment) = 0;
     __host__ __device__
-    virtual bool do_is_equal(const memory_resource & other) const THRUST_NOEXCEPT
+    virtual bool do_is_equal(const memory_resource & other) const noexcept
     {
         return this == &other;
     }
@@ -182,7 +182,7 @@ class memory_resource<void *>
  */
 template<typename Pointer>
 __host__ __device__
-bool operator==(const memory_resource<Pointer> & lhs, const memory_resource<Pointer> & rhs) THRUST_NOEXCEPT
+bool operator==(const memory_resource<Pointer> & lhs, const memory_resource<Pointer> & rhs) noexcept
 {
     return &lhs == &rhs || rhs.is_equal(rhs);
 }
@@ -191,7 +191,7 @@ bool operator==(const memory_resource<Pointer> & lhs, const memory_resource<Poin
  */
 template<typename Pointer>
 __host__ __device__
-bool operator!=(const memory_resource<Pointer> & lhs, const memory_resource<Pointer> & rhs) THRUST_NOEXCEPT
+bool operator!=(const memory_resource<Pointer> & lhs, const memory_resource<Pointer> & rhs) noexcept
 {
     return !(lhs == rhs);
 }
@@ -199,7 +199,7 @@ bool operator!=(const memory_resource<Pointer> & lhs, const memory_resource<Poin
 /*! Returns a global instance of \p MR, created as a function local static variable.
  *
  *  \tparam MR type of a memory resource to get an instance from. Must be \p DefaultConstructible.
- *  \returns a pointer to a global instance of \p MR.
+ *  \return a pointer to a global instance of \p MR.
  */
 template<typename MR>
 __host__
@@ -209,9 +209,9 @@ MR * get_global_resource()
     return &resource;
 }
 
-/*! \}
+/*! \} // memory_resource
  */
 
 } // end mr
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/mr/new.h b/thrust/mr/new.h
index d72b6f47b..644e25169 100644
--- a/thrust/mr/new.h
+++ b/thrust/mr/new.h
@@ -14,33 +14,34 @@
  *  limitations under the License.
  */
 
-/*! \file new.h
+/*! \file
  *  \brief Global operator new-based memory resource.
  */
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/mr/memory_resource.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace mr
 {
 
 /** \addtogroup memory_resources Memory Resources
- *  \ingroup memory_management_classes
+ *  \ingroup memory_management
  *  \{
  */
 
 /*! A memory resource that uses global operators new and delete to allocate and deallocate memory. Uses alignment-enabled
  *      overloads when available, otherwise uses regular overloads and implements alignment requirements by itself.
  */
-class new_delete_resource THRUST_FINAL : public memory_resource<>
+class new_delete_resource final : public memory_resource<>
 {
 public:
-    void * do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    void * do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
     {
-#if __cplusplus >= 201703L
+#if defined(__cpp_aligned_new)
         return ::operator new(bytes, std::align_val_t(alignment));
 #else
         // allocate memory for bytes, plus potential alignment correction,
@@ -59,10 +60,15 @@ class new_delete_resource THRUST_FINAL : public memory_resource<>
 #endif
     }
 
-    void do_deallocate(void * p, std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    void do_deallocate(void * p, std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
     {
-#if __cplusplus >= 201703L
+#if defined(__cpp_aligned_new)
+# if defined(__cpp_sized_deallocation)
         ::operator delete(p, bytes, std::align_val_t(alignment));
+# else
+        (void)bytes;
+        ::operator delete(p, std::align_val_t(alignment));
+# endif
 #else
         (void)alignment;
         char * ptr = static_cast<char *>(p);
@@ -75,9 +81,9 @@ class new_delete_resource THRUST_FINAL : public memory_resource<>
     }
 };
 
-/*! \}
+/*! \} // memory_resources
  */
 
 } // end mr
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/mr/polymorphic_adaptor.h b/thrust/mr/polymorphic_adaptor.h
index d5d98bf83..0562a8f82 100644
--- a/thrust/mr/polymorphic_adaptor.h
+++ b/thrust/mr/polymorphic_adaptor.h
@@ -16,33 +16,34 @@
 
 #pragma once
 
-#include "memory_resource.h"
+#include <thrust/detail/config.h>
 
-namespace thrust
-{
+#include <thrust/mr/memory_resource.h>
+
+THRUST_NAMESPACE_BEGIN
 namespace mr
 {
 
 template<typename Pointer = void *>
-class polymorphic_adaptor_resource THRUST_FINAL : public memory_resource<Pointer>
+class polymorphic_adaptor_resource final : public memory_resource<Pointer>
 {
 public:
     polymorphic_adaptor_resource(memory_resource<Pointer> * t) : upstream_resource(t)
     {
     }
 
-    virtual Pointer do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    virtual Pointer do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
     {
         return upstream_resource->allocate(bytes, alignment);
     }
 
-    virtual void do_deallocate(Pointer p, std::size_t bytes, std::size_t alignment) THRUST_OVERRIDE
+    virtual void do_deallocate(Pointer p, std::size_t bytes, std::size_t alignment) override
     {
         return upstream_resource->deallocate(p, bytes, alignment);
     }
 
     __host__ __device__
-    virtual bool do_is_equal(const memory_resource<Pointer> & other) const THRUST_NOEXCEPT THRUST_OVERRIDE
+    virtual bool do_is_equal(const memory_resource<Pointer> & other) const noexcept override
     {
         return upstream_resource->is_equal(other);
     }
@@ -52,5 +53,5 @@ class polymorphic_adaptor_resource THRUST_FINAL : public memory_resource<Pointer
 };
 
 } // end mr
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/mr/pool.h b/thrust/mr/pool.h
index 4e311f5b3..6259a23f1 100644
--- a/thrust/mr/pool.h
+++ b/thrust/mr/pool.h
@@ -14,14 +14,17 @@
  *  limitations under the License.
  */
 
-/*! \file pool.h
- *  \brief A caching and pooling memory resource adaptor which uses a single upstream resource for memory allocation,
- *      and embeds bookkeeping information in allocated blocks.
+/*! \file 
+ *  \brief A caching and pooling memory resource adaptor which uses a single
+ *  upstream resource for memory allocation, and embeds bookkeeping information
+ *  in allocated blocks.
  */
 
 #pragma once
 
-#include <algorithm>
+#include <thrust/detail/config.h>
+
+#include <thrust/detail/algorithm_wrapper.h>
 
 #include <thrust/host_vector.h>
 
@@ -31,13 +34,12 @@
 
 #include <cassert>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace mr
 {
 
 /** \addtogroup memory_resources Memory Resources
- *  \ingroup memory_management_classes
+ *  \ingroup memory_management
  *  \{
  */
 
@@ -63,7 +65,7 @@ namespace mr
  *  \tparam Upstream the type of memory resources that will be used for allocating memory blocks
  */
 template<typename Upstream>
-class unsynchronized_pool_resource THRUST_FINAL
+class unsynchronized_pool_resource final
     : public memory_resource<typename Upstream::pointer>,
         private validator<Upstream>
 {
@@ -250,7 +252,7 @@ class unsynchronized_pool_resource THRUST_FINAL
         m_cached_oversized = oversized_block_descriptor_ptr();
     }
 
-    THRUST_NODISCARD virtual void_ptr do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    THRUST_NODISCARD virtual void_ptr do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
     {
         bytes = (std::max)(bytes, m_options.smallest_block_size);
         assert(detail::is_power_of_2(alignment));
@@ -392,10 +394,10 @@ class unsynchronized_pool_resource THRUST_FINAL
                 )
             );
 
-            chunk_descriptor desc;
-            desc.size = chunk_size;
-            desc.next = m_allocated;
-            *chunk = desc;
+            chunk_descriptor chunk_desc;
+            chunk_desc.size = chunk_size;
+            chunk_desc.next = m_allocated;
+            *chunk = chunk_desc;
             m_allocated = chunk;
 
             for (std::size_t i = 0; i < n; ++i)
@@ -406,9 +408,9 @@ class unsynchronized_pool_resource THRUST_FINAL
                     )
                 );
 
-                block_descriptor desc;
-                desc.next = bucket.free_list;
-                *block = desc;
+                block_descriptor block_desc;
+                block_desc.next = bucket.free_list;
+                *block = block_desc;
                 bucket.free_list = block;
             }
         }
@@ -423,7 +425,7 @@ class unsynchronized_pool_resource THRUST_FINAL
         );
     }
 
-    virtual void do_deallocate(void_ptr p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    virtual void do_deallocate(void_ptr p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
     {
         n = (std::max)(n, m_options.smallest_block_size);
         assert(detail::is_power_of_2(alignment));
@@ -497,9 +499,9 @@ class unsynchronized_pool_resource THRUST_FINAL
     }
 };
 
-/*! \}
+/*! \} // memory_resources
  */
 
 } // end mr
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/mr/pool_options.h b/thrust/mr/pool_options.h
index 60430b7d2..13a8fe674 100644
--- a/thrust/mr/pool_options.h
+++ b/thrust/mr/pool_options.h
@@ -14,8 +14,9 @@
  *  limitations under the License.
  */
 
-/*! \file pool_options.h
- *  \brief \p pool_options is a type used by the pooling resource adaptors to fine-tune their behavior.
+/*! \file 
+ *  \brief A type used by the pooling resource adaptors to fine-tune their
+ *  behavior.
  */
 
 #pragma once
@@ -24,14 +25,14 @@
 
 #include <thrust/detail/integer_math.h>
 
-#include <thrust/mr/detail/config.h>
+#include <thrust/detail/config.h>
+#include <thrust/detail/config/memory_resource.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace mr
 {
 
-/*! \addtogroup memory_management_classes Memory Management Classes
+/*! \addtogroup memory_resources Memory Resources
  *  \ingroup memory_management
  *  \{
  */
@@ -119,9 +120,9 @@ struct pool_options
     }
 };
 
-/*! \}
+/*! \} // memory_resources
  */
 
 } // end mr
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/mr/sync_pool.h b/thrust/mr/sync_pool.h
index 9cf8640ca..46c0e8441 100644
--- a/thrust/mr/sync_pool.h
+++ b/thrust/mr/sync_pool.h
@@ -14,12 +14,13 @@
  *  limitations under the License.
  */
 
-/*! \file sync_pool.h
+/*! \file 
  *  \brief A mutex-synchronized version of \p unsynchronized_pool_resource.
  */
 
 #pragma once
 
+#include <thrust/detail/config.h>
 #include <thrust/detail/cpp11_required.h>
 
 #if THRUST_CPP_DIALECT >= 2011
@@ -28,15 +29,12 @@
 
 #include <thrust/mr/pool.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace mr
 {
 
-/*! \addtogroup memory_management Memory Management
- *  \addtogroup memory_management_classes Memory Management Classes
- *  \addtogroup memory_resources Memory Resources
- *  \ingroup memory_resources
+/*! \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management
  *  \{
  */
 
@@ -89,13 +87,13 @@ struct synchronized_pool_resource : public memory_resource<typename Upstream::po
         upstream_pool.release();
     }
 
-    THRUST_NODISCARD virtual void_ptr do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    THRUST_NODISCARD virtual void_ptr do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
     {
         lock_t lock(mtx);
         return upstream_pool.do_allocate(bytes, alignment);
     }
 
-    virtual void do_deallocate(void_ptr p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+    virtual void do_deallocate(void_ptr p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
     {
         lock_t lock(mtx);
         upstream_pool.do_deallocate(p, n, alignment);
@@ -106,11 +104,11 @@ struct synchronized_pool_resource : public memory_resource<typename Upstream::po
     unsync_pool upstream_pool;
 };
 
-/*! \}
+/*! \} // memory_resources
  */
 
 } // end mr
-} // end thrust
+THRUST_NAMESPACE_END
 
 #endif // THRUST_CPP_DIALECT >= 2011
 
diff --git a/thrust/mr/tls_pool.h b/thrust/mr/tls_pool.h
index 381917fd5..8ee8127a3 100644
--- a/thrust/mr/tls_pool.h
+++ b/thrust/mr/tls_pool.h
@@ -20,14 +20,14 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
 #include <thrust/detail/cpp11_required.h>
 
 #if THRUST_CPP_DIALECT >= 2011
 
 #include <thrust/mr/pool.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace mr
 {
 
@@ -43,7 +43,7 @@ namespace mr
  *  \param upstream the argument to the constructor, if invoked
  */
 template<typename Upstream, typename Bookkeeper>
-__host__ __device__
+__host__
 thrust::mr::unsynchronized_pool_resource<Upstream> & tls_pool(Upstream * upstream = NULL)
 {
     static thread_local auto adaptor = [&]{
@@ -58,7 +58,7 @@ thrust::mr::unsynchronized_pool_resource<Upstream> & tls_pool(Upstream * upstrea
  */
 
 } // end mr
-} // end thrust
+THRUST_NAMESPACE_END
 
 #endif // THRUST_CPP_DIALECT >= 2011
 
diff --git a/thrust/detail/util/blocking.h b/thrust/mr/universal_memory_resource.h
similarity index 73%
rename from thrust/detail/util/blocking.h
rename to thrust/mr/universal_memory_resource.h
index 747d9b97b..b7f1ebd6f 100644
--- a/thrust/detail/util/blocking.h
+++ b/thrust/mr/universal_memory_resource.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2018-2020 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,24 +14,9 @@
  *  limitations under the License.
  */
 
-
 #pragma once
 
-//functions to support blocking
-
-namespace thrust
-{
-
-namespace detail
-{
-
-namespace util
-{
-
-
-} // end namespace util
-
-} // end namespace detail
+#include <thrust/detail/config.h>
 
-} // end namespace thrust
+#include <thrust/mr/device_memory_resource.h>
 
diff --git a/thrust/mr/validator.h b/thrust/mr/validator.h
index 7f7e12c76..10e964821 100644
--- a/thrust/mr/validator.h
+++ b/thrust/mr/validator.h
@@ -16,18 +16,19 @@
 
 #pragma once
 
-#include "detail/config.h"
-#include "memory_resource.h"
+#include <thrust/detail/config.h>
 
-namespace thrust
-{
+#include <thrust/detail/config/memory_resource.h>
+#include <thrust/mr/memory_resource.h>
+
+THRUST_NAMESPACE_BEGIN
 namespace mr
 {
 
 template<typename MR>
 struct validator
 {
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
   static_assert(
     std::is_base_of<memory_resource<typename MR::pointer>, MR>::value,
     "a type used as a memory resource must derive from memory_resource"
@@ -46,5 +47,5 @@ struct validator2<T, T> : private validator<T>
 };
 
 } // end mr
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/optional.h b/thrust/optional.h
index 94d10d902..a1ca4f465 100644
--- a/thrust/optional.h
+++ b/thrust/optional.h
@@ -1,6 +1,6 @@
 ///
 // optional - An implementation of std::optional with extensions
-// Written in 2017 by Simon Brand (@TartanLlama)
+// Written in 2017 by Sy Brand (@TartanLlama)
 //
 // To the extent possible under law, the author(s) have dedicated all
 // copyright and related and neighboring rights to this software to the
@@ -15,6 +15,7 @@
 
 #include <thrust/detail/config.h>
 #include <thrust/detail/cpp11_required.h>
+#include <thrust/detail/type_traits.h>
 
 #if THRUST_CPP_DIALECT >= 2011
 
@@ -30,7 +31,7 @@
 #include <type_traits>
 #include <utility>
 
-#if (defined(_MSC_VER) && _MSC_VER == 1900)
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC && _MSC_VER == 1900)
 #define THRUST_OPTIONAL_MSVC2015
 #endif
 
@@ -59,6 +60,11 @@
   std::has_trivial_copy_constructor<T>::value
 #define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_ASSIGNABLE(T) std::has_trivial_copy_assign<T>::value
 
+// GCC < 5 doesn't provide a way to emulate std::is_trivially_move_*,
+// so don't enable any optimizations that rely on them:
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE(T) false
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_ASSIGNABLE(T) false
+
 // This one will be different for GCC 5.7 if it's ever supported
 #define THRUST_OPTIONAL_IS_TRIVIALLY_DESTRUCTIBLE(T) std::is_trivially_destructible<T>::value
 
@@ -68,7 +74,7 @@
      !defined(__clang__))
 #ifndef THRUST_GCC_LESS_8_TRIVIALLY_COPY_CONSTRUCTIBLE_MUTEX
 #define THRUST_GCC_LESS_8_TRIVIALLY_COPY_CONSTRUCTIBLE_MUTEX
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
   namespace detail {
       template<class T>
       struct is_trivially_copy_constructible : std::is_trivially_copy_constructible<T>{};
@@ -76,30 +82,79 @@ THRUST_BEGIN_NS
       template<class T, class A>
       struct is_trivially_copy_constructible<std::vector<T,A>>
           : std::is_trivially_copy_constructible<T>{};
-#endif      
+#endif
   }
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
 
 #define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T)                                     \
     thrust::detail::is_trivially_copy_constructible<T>::value
 #define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_ASSIGNABLE(T)                                        \
   std::is_trivially_copy_assignable<T>::value
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE(T)                                     \
+  std::is_trivially_move_constructible<T>::value
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_ASSIGNABLE(T)                                        \
+  std::is_trivially_move_assignable<T>::value
 #define THRUST_OPTIONAL_IS_TRIVIALLY_DESTRUCTIBLE(T) std::is_trivially_destructible<T>::value
 #else
-#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T)                                     \
+
+// To support clang + old libstdc++ without type traits, check for equivalent
+// clang built-ins and use them if present. See note above
+// is_trivially_copyable_impl in
+// thrust/type_traits/is_trivially_relocatable.h for more details.
+
+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+
+#if defined(__GLIBCXX__) && __has_feature(is_trivially_constructible)
+#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T) \
+  __is_trivially_constructible(T, T const&)
+#else
+#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T) \
   std::is_trivially_copy_constructible<T>::value
-#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_ASSIGNABLE(T)                                        \
+#endif
+
+#if defined(__GLIBCXX__) && __has_feature(is_trivially_assignable)
+#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_ASSIGNABLE(T) \
+  __is_trivially_assignable(T&, T const&)
+#else
+#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_ASSIGNABLE(T) \
   std::is_trivially_copy_assignable<T>::value
-#define THRUST_OPTIONAL_IS_TRIVIALLY_DESTRUCTIBLE(T) std::is_trivially_destructible<T>::value
 #endif
 
-#if __cplusplus > 201103L
+#if defined(__GLIBCXX__) && __has_feature(is_trivially_constructible)
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE(T) \
+  __is_trivially_constructible(T, T&&)
+#else
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE(T) \
+  std::is_trivially_move_constructible<T>::value
+#endif
+
+#if defined(__GLIBCXX__) && __has_feature(is_trivially_assignable)
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_ASSIGNABLE(T) \
+  __is_trivially_assignable(T&, T&&)
+#else
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_ASSIGNABLE(T) \
+  std::is_trivially_move_assignable<T>::value
+#endif
+
+#if defined(__GLIBCXX__) && __has_feature(is_trivially_destructible)
+#define THRUST_OPTIONAL_IS_TRIVIALLY_DESTRUCTIBLE(T) \
+  __is_trivially_destructible(T)
+#else
+#define THRUST_OPTIONAL_IS_TRIVIALLY_DESTRUCTIBLE(T) \
+  std::is_trivially_destructible<T>::value
+#endif
+
+#endif
+
+#if THRUST_CPP_DIALECT > 2011
 #define THRUST_OPTIONAL_CPP14
 #endif
 
 // constexpr implies const in C++11, not C++14
-#if (__cplusplus == 201103L || defined(THRUST_OPTIONAL_MSVC2015) ||                \
+#if (THRUST_CPP_DIALECT == 2011 || defined(THRUST_OPTIONAL_MSVC2015) ||                \
      defined(THRUST_OPTIONAL_GCC49))
 /// \exclude
 #define THRUST_OPTIONAL_CPP11_CONSTEXPR
@@ -108,7 +163,8 @@ THRUST_END_NS
 #define THRUST_OPTIONAL_CPP11_CONSTEXPR constexpr
 #endif
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
+
 #ifndef THRUST_MONOSTATE_INPLACE_MUTEX
 #define THRUST_MONOSTATE_INPLACE_MUTEX
 /// \brief Used to represent an optional with no data; essentially a bool
@@ -145,7 +201,7 @@ template <class B, class... Bs>
 struct conjunction<B, Bs...>
     : std::conditional<bool(B::value), conjunction<Bs...>, B>::type {};
 
-#if defined(_LIBCPP_VERSION) && __cplusplus == 201103L
+#if defined(_LIBCPP_VERSION) && THRUST_CPP_DIALECT == 2011
 #define THRUST_OPTIONAL_LIBCXX_MEM_FN_WORKAROUND
 #endif
 
@@ -159,17 +215,17 @@ struct is_pointer_to_non_const_member_func<Ret (T::*) (Args...)> : std::true_typ
 template <class T, class Ret, class... Args>
 struct is_pointer_to_non_const_member_func<Ret (T::*) (Args...)&> : std::true_type{};
 template <class T, class Ret, class... Args>
-struct is_pointer_to_non_const_member_func<Ret (T::*) (Args...)&&> : std::true_type{};        
+struct is_pointer_to_non_const_member_func<Ret (T::*) (Args...)&&> : std::true_type{};
 template <class T, class Ret, class... Args>
 struct is_pointer_to_non_const_member_func<Ret (T::*) (Args...) volatile> : std::true_type{};
 template <class T, class Ret, class... Args>
 struct is_pointer_to_non_const_member_func<Ret (T::*) (Args...) volatile&> : std::true_type{};
 template <class T, class Ret, class... Args>
-struct is_pointer_to_non_const_member_func<Ret (T::*) (Args...) volatile&&> : std::true_type{};        
+struct is_pointer_to_non_const_member_func<Ret (T::*) (Args...) volatile&&> : std::true_type{};
 
 template <class T> struct is_const_or_const_ref : std::false_type{};
 template <class T> struct is_const_or_const_ref<T const&> : std::true_type{};
-template <class T> struct is_const_or_const_ref<T const> : std::true_type{};    
+template <class T> struct is_const_or_const_ref<T const> : std::true_type{};
 #endif
 
 // std::invoke from C++17
@@ -177,15 +233,16 @@ template <class T> struct is_const_or_const_ref<T const> : std::true_type{};
 __thrust_exec_check_disable__
 template <typename Fn, typename... Args,
 #ifdef THRUST_OPTIONAL_LIBCXX_MEM_FN_WORKAROUND
-          typename = enable_if_t<!(is_pointer_to_non_const_member_func<Fn>::value 
-                                 && is_const_or_const_ref<Args...>::value)>, 
+          typename = enable_if_t<!(is_pointer_to_non_const_member_func<Fn>::value
+                                 && is_const_or_const_ref<Args...>::value)>,
 #endif
           typename = enable_if_t<std::is_member_pointer<decay_t<Fn>>::value>,
           int = 0>
 __host__ __device__
-constexpr auto invoke(Fn &&f, Args &&... args) noexcept(
-    noexcept(std::mem_fn(f)(std::forward<Args>(args)...)))
-    -> decltype(std::mem_fn(f)(std::forward<Args>(args)...)) {
+constexpr auto invoke(Fn &&f, Args &&... args)
+  noexcept(noexcept(std::mem_fn(f)(std::forward<Args>(args)...)))
+  THRUST_TRAILING_RETURN(decltype(std::mem_fn(f)(std::forward<Args>(args)...)))
+{
   return std::mem_fn(f)(std::forward<Args>(args)...);
 }
 
@@ -193,27 +250,12 @@ __thrust_exec_check_disable__
 template <typename Fn, typename... Args,
           typename = enable_if_t<!std::is_member_pointer<decay_t<Fn>>::value>>
 __host__ __device__
-constexpr auto invoke(Fn &&f, Args &&... args) noexcept(
-    noexcept(std::forward<Fn>(f)(std::forward<Args>(args)...)))
-    -> decltype(std::forward<Fn>(f)(std::forward<Args>(args)...)) {
+constexpr auto invoke(Fn &&f, Args &&... args)
+  noexcept(noexcept(std::forward<Fn>(f)(std::forward<Args>(args)...)))
+  THRUST_TRAILING_RETURN(decltype(std::forward<Fn>(f)(std::forward<Args>(args)...)))
+{
   return std::forward<Fn>(f)(std::forward<Args>(args)...);
 }
-
-// std::invoke_result from C++17
-template <class F, class, class... Us> struct invoke_result_impl;
-
-template <class F, class... Us>
-struct invoke_result_impl<
-    F, decltype(detail::invoke(std::declval<F>(), std::declval<Us>()...), void()),
-    Us...> {
-  using type = decltype(detail::invoke(std::declval<F>(), std::declval<Us>()...));
-};
-
-template <class F, class... Us>
-using invoke_result = invoke_result_impl<F, void, Us...>;
-
-template <class F, class... Us>
-using invoke_result_t = typename invoke_result<F, Us...>::type;
 #endif
 
 // std::void_t from C++17
@@ -288,7 +330,7 @@ using enable_assign_from_other = detail::enable_if_t<
     !std::is_assignable<T &, const optional<U> &>::value &&
     !std::is_assignable<T &, const optional<U> &&>::value>;
 
-#ifdef _MSC_VER
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
 // TODO make a version which works with MSVC
 template <class T, class U = T> struct is_swappable : std::true_type {};
 
@@ -435,7 +477,7 @@ template <class T> struct optional_operations_base : optional_storage_base<T> {
   template <class... Args>
   __host__ __device__
   void construct(Args &&... args) noexcept {
-    new (addressof(this->m_value)) T(std::forward<Args>(args)...);
+    new (thrust::addressof(this->m_value)) T(std::forward<Args>(args)...);
     this->m_has_value = true;
   }
 
@@ -509,19 +551,10 @@ struct optional_copy_base<T, false> : optional_operations_base<T> {
   optional_copy_base &operator=(optional_copy_base &&rhs) = default;
 };
 
-// This class manages conditionally having a trivial move constructor
-// Unfortunately there's no way to achieve this in GCC < 5 AFAIK, since it
-// doesn't implement an analogue to std::is_trivially_move_constructible. We
-// have to make do with a non-trivial move constructor even if T is trivially
-// move constructible
-#ifndef THRUST_OPTIONAL_GCC49
-template <class T, bool = std::is_trivially_move_constructible<T>::value>
+template <class T, bool = THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE(T)>
 struct optional_move_base : optional_copy_base<T> {
   using optional_copy_base<T>::optional_copy_base;
 };
-#else
-template <class T, bool = false> struct optional_move_base;
-#endif
 template <class T> struct optional_move_base<T, false> : optional_copy_base<T> {
   using optional_copy_base<T>::optional_copy_base;
 
@@ -576,21 +609,13 @@ struct optional_copy_assign_base<T, false> : optional_move_base<T> {
   operator=(optional_copy_assign_base &&rhs) = default;
 };
 
-// This class manages conditionally having a trivial move assignment operator
-// Unfortunately there's no way to achieve this in GCC < 5 AFAIK, since it
-// doesn't implement an analogue to std::is_trivially_move_assignable. We have
-// to make do with a non-trivial move assignment operator even if T is trivially
-// move assignable
-#ifndef THRUST_OPTIONAL_GCC49
-template <class T, bool = std::is_trivially_destructible<T>::value
-                       &&std::is_trivially_move_constructible<T>::value
-                           &&std::is_trivially_move_assignable<T>::value>
+template <class T,
+          bool = THRUST_OPTIONAL_IS_TRIVIALLY_DESTRUCTIBLE(T) &&
+                 THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE(T) &&
+                 THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_ASSIGNABLE(T)>
 struct optional_move_assign_base : optional_copy_assign_base<T> {
   using optional_copy_assign_base<T>::optional_copy_assign_base;
 };
-#else
-template <class T, bool = false> struct optional_move_assign_base;
-#endif
 
 template <class T>
 struct optional_move_assign_base<T, false> : optional_copy_assign_base<T> {
@@ -802,13 +827,13 @@ class optional : private detail::optional_move_assign_base<T>,
 // The different versions for C++14 and 11 are needed because deduced return
 // types are not SFINAE-safe. This provides better support for things like
 // generic lambdas. C.f.
-// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0826r0.html
+// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0826r0
 #if defined(THRUST_OPTIONAL_CPP14) && !defined(THRUST_OPTIONAL_GCC49) &&               \
     !defined(THRUST_OPTIONAL_GCC54) && !defined(THRUST_OPTIONAL_GCC55)
   /// \group and_then
   /// Carries out some operation which returns an optional on the stored
   /// object if there is one. \requires `std::invoke(std::forward<F>(f),
-  /// value())` returns a `std::optional<U>` for some `U`. \returns Let `U` be
+  /// value())` returns a `std::optional<U>` for some `U`. \return Let `U` be
   /// the result of `std::invoke(std::forward<F>(f), value())`. Returns a
   /// `std::optional<U>`. The return value is empty if `*this` is empty,
   /// otherwise the return value of `std::invoke(std::forward<F>(f), value())`
@@ -875,7 +900,7 @@ class optional : private detail::optional_move_assign_base<T>,
   /// Carries out some operation which returns an optional on the stored
   /// object if there is one. \requires `std::invoke(std::forward<F>(f),
   /// value())` returns a `std::optional<U>` for some `U`.
-  /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f),
+  /// \return Let `U` be the result of `std::invoke(std::forward<F>(f),
   /// value())`. Returns a `std::optional<U>`. The return value is empty if
   /// `*this` is empty, otherwise the return value of
   /// `std::invoke(std::forward<F>(f), value())` is returned.
@@ -941,7 +966,7 @@ class optional : private detail::optional_move_assign_base<T>,
 #if defined(THRUST_OPTIONAL_CPP14) && !defined(THRUST_OPTIONAL_GCC49) &&               \
     !defined(THRUST_OPTIONAL_GCC54) && !defined(THRUST_OPTIONAL_GCC55)
   /// \brief Carries out some operation on the stored object if there is one.
-  /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f),
+  /// \return Let `U` be the result of `std::invoke(std::forward<F>(f),
   /// value())`. Returns a `std::optional<U>`. The return value is empty if
   /// `*this` is empty, otherwise an `optional<U>` is constructed from the
   /// return value of `std::invoke(std::forward<F>(f), value())` and is
@@ -984,7 +1009,7 @@ class optional : private detail::optional_move_assign_base<T>,
   }
 #else
   /// \brief Carries out some operation on the stored object if there is one.
-  /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f),
+  /// \return Let `U` be the result of `std::invoke(std::forward<F>(f),
   /// value())`. Returns a `std::optional<U>`. The return value is empty if
   /// `*this` is empty, otherwise an `optional<U>` is constructed from the
   /// return value of `std::invoke(std::forward<F>(f), value())` and is
@@ -1225,7 +1250,7 @@ class optional : private detail::optional_move_assign_base<T>,
   }
 #endif
 
-  /// \returns `u` if `*this` has a value, otherwise an empty optional.
+  /// \return `u` if `*this` has a value, otherwise an empty optional.
   __thrust_exec_check_disable__
   template <class U>
   __host__ __device__
@@ -1234,7 +1259,7 @@ class optional : private detail::optional_move_assign_base<T>,
     return has_value() ? result{u} : result{nullopt};
   }
 
-  /// \returns `rhs` if `*this` is empty, otherwise the current value.
+  /// \return `rhs` if `*this` is empty, otherwise the current value.
   /// \group disjunction
   __thrust_exec_check_disable__
   __host__ __device__
@@ -1555,7 +1580,7 @@ class optional : private detail::optional_move_assign_base<T>,
 
     *this = nullopt;
     this->construct(std::forward<Args>(args)...);
-    return value();
+    return this->m_value;
   }
 
   /// \group emplace
@@ -1569,7 +1594,7 @@ class optional : private detail::optional_move_assign_base<T>,
   emplace(std::initializer_list<U> il, Args &&... args) {
     *this = nullopt;
     this->construct(il, std::forward<Args>(args)...);
-    return value();    
+    return this->m_value;
   }
 
   /// Swaps this optional with the other.
@@ -1597,7 +1622,7 @@ class optional : private detail::optional_move_assign_base<T>,
     }
   }
 
-  /// \returns a pointer to the stored value
+  /// \return a pointer to the stored value
   /// \requires a value is stored
   /// \group pointer
   /// \synopsis constexpr const T *operator->() const;
@@ -1615,7 +1640,7 @@ class optional : private detail::optional_move_assign_base<T>,
     return addressof(this->m_value);
   }
 
-  /// \returns the stored value
+  /// \return the stored value
   /// \requires a value is stored
   /// \group deref
   /// \synopsis constexpr T &operator*();
@@ -1643,7 +1668,7 @@ class optional : private detail::optional_move_assign_base<T>,
   constexpr const T &&operator*() const && { return std::move(this->m_value); }
 #endif
 
-  /// \returns whether or not the optional has a value
+  /// \return whether or not the optional has a value
   /// \group has_value
   __thrust_exec_check_disable__
   __host__ __device__
@@ -1656,7 +1681,7 @@ class optional : private detail::optional_move_assign_base<T>,
     return this->m_has_value;
   }
 
-  /// \returns the contained value if there is one, otherwise throws
+  /// \return the contained value if there is one, otherwise throws
   /// [bad_optional_access]
   /// \group value
   /// \synopsis constexpr T &value();
@@ -1692,7 +1717,7 @@ class optional : private detail::optional_move_assign_base<T>,
   }
 #endif
 
-  /// \returns the stored value if there is one, otherwise returns `u`
+  /// \return the stored value if there is one, otherwise returns `u`
   /// \group value_or
   __thrust_exec_check_disable__
   template <class U>
@@ -1813,58 +1838,58 @@ inline constexpr bool operator!=(nullopt_t, const optional<T> &rhs) noexcept {
   return rhs.has_value();
 }
 /// \group relop_nullopt
-__thrust_exec_check_disable__                                                    
-template <class T>                                                               
-__host__ __device__       
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
 inline constexpr bool operator<(const optional<T> &, nullopt_t) noexcept {
   return false;
 }
 /// \group relop_nullopt
-__thrust_exec_check_disable__                                                    
-template <class T>                                                               
-__host__ __device__       
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
 inline constexpr bool operator<(nullopt_t, const optional<T> &rhs) noexcept {
   return rhs.has_value();
 }
 /// \group relop_nullopt
-__thrust_exec_check_disable__                                                    
-template <class T>                                                               
-__host__ __device__       
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
 inline constexpr bool operator<=(const optional<T> &lhs, nullopt_t) noexcept {
   return !lhs.has_value();
 }
 /// \group relop_nullopt
-__thrust_exec_check_disable__                                                    
-template <class T>                                                               
-__host__ __device__       
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
 inline constexpr bool operator<=(nullopt_t, const optional<T> &) noexcept {
   return true;
 }
 /// \group relop_nullopt
-__thrust_exec_check_disable__                                                    
-template <class T>                                                               
-__host__ __device__       
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
 inline constexpr bool operator>(const optional<T> &lhs, nullopt_t) noexcept {
   return lhs.has_value();
 }
 /// \group relop_nullopt
-__thrust_exec_check_disable__                                                    
-template <class T>                                                               
-__host__ __device__       
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
 inline constexpr bool operator>(nullopt_t, const optional<T> &) noexcept {
   return false;
 }
 /// \group relop_nullopt
-__thrust_exec_check_disable__                                                    
-template <class T>                                                               
-__host__ __device__       
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
 inline constexpr bool operator>=(const optional<T> &, nullopt_t) noexcept {
   return true;
 }
 /// \group relop_nullopt
-__thrust_exec_check_disable__                                                    
-template <class T>                                                               
-__host__ __device__       
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
 inline constexpr bool operator>=(nullopt_t, const optional<T> &rhs) noexcept {
   return !rhs.has_value();
 }
@@ -1997,10 +2022,12 @@ inline constexpr optional<T> make_optional(std::initializer_list<U> il,
   return optional<T>(in_place, il, std::forward<Args>(args)...);
 }
 
-#if __cplusplus >= 201703L
+#if THRUST_CPP_DIALECT >= 2017
 template <class T> optional(T)->optional<T>;
 #endif
 
+// Doxygen chokes on the trailing return types used below.
+#if !defined(THRUST_DOXYGEN)
 /// \exclude
 namespace detail {
 #ifdef THRUST_OPTIONAL_CPP14
@@ -2037,7 +2064,7 @@ template <class Opt, class F,
                                               *std::declval<Opt>())),
           detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
 __host__ __device__
-constexpr auto optional_map_impl(Opt &&opt, F &&f) -> optional<Ret> {
+constexpr optional<Ret> optional_map_impl(Opt &&opt, F &&f) {
   return opt.has_value()
              ? detail::invoke(std::forward<F>(f), *std::forward<Opt>(opt))
              : optional<Ret>(nullopt);
@@ -2049,7 +2076,8 @@ template <class Opt, class F,
                                               *std::declval<Opt>())),
           detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
 __host__ __device__
-auto optional_map_impl(Opt &&opt, F &&f) -> optional<monostate> {
+auto optional_map_impl(Opt &&opt, F &&f) -> optional<monostate>
+{
   if (opt.has_value()) {
     detail::invoke(std::forward<F>(f), *std::forward<Opt>(opt));
     return monostate{};
@@ -2059,6 +2087,7 @@ auto optional_map_impl(Opt &&opt, F &&f) -> optional<monostate> {
 }
 #endif
 } // namespace detail
+#endif // !defined(THRUST_DOXYGEN)
 
 /// Specialization for when `T` is a reference. `optional<T&>` acts similarly
 /// to a `T*`, but provides more operations and shows intent more clearly.
@@ -2087,13 +2116,13 @@ template <class T> class optional<T &> {
 // The different versions for C++14 and 11 are needed because deduced return
 // types are not SFINAE-safe. This provides better support for things like
 // generic lambdas. C.f.
-// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0826r0.html
+// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0826r0
 #if defined(THRUST_OPTIONAL_CPP14) && !defined(THRUST_OPTIONAL_GCC49) &&               \
     !defined(THRUST_OPTIONAL_GCC54) && !defined(THRUST_OPTIONAL_GCC55)
   /// \group and_then
   /// Carries out some operation which returns an optional on the stored
   /// object if there is one. \requires `std::invoke(std::forward<F>(f),
-  /// value())` returns a `std::optional<U>` for some `U`. \returns Let `U` be
+  /// value())` returns a `std::optional<U>` for some `U`. \return Let `U` be
   /// the result of `std::invoke(std::forward<F>(f), value())`. Returns a
   /// `std::optional<U>`. The return value is empty if `*this` is empty,
   /// otherwise the return value of `std::invoke(std::forward<F>(f), value())`
@@ -2159,7 +2188,7 @@ template <class T> class optional<T &> {
   /// \group and_then
   /// Carries out some operation which returns an optional on the stored
   /// object if there is one. \requires `std::invoke(std::forward<F>(f),
-  /// value())` returns a `std::optional<U>` for some `U`. \returns Let `U` be
+  /// value())` returns a `std::optional<U>` for some `U`. \return Let `U` be
   /// the result of `std::invoke(std::forward<F>(f), value())`. Returns a
   /// `std::optional<U>`. The return value is empty if `*this` is empty,
   /// otherwise the return value of `std::invoke(std::forward<F>(f), value())`
@@ -2226,7 +2255,7 @@ template <class T> class optional<T &> {
 #if defined(THRUST_OPTIONAL_CPP14) && !defined(THRUST_OPTIONAL_GCC49) &&               \
     !defined(THRUST_OPTIONAL_GCC54) && !defined(THRUST_OPTIONAL_GCC55)
   /// \brief Carries out some operation on the stored object if there is one.
-  /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f),
+  /// \return Let `U` be the result of `std::invoke(std::forward<F>(f),
   /// value())`. Returns a `std::optional<U>`. The return value is empty if
   /// `*this` is empty, otherwise an `optional<U>` is constructed from the
   /// return value of `std::invoke(std::forward<F>(f), value())` and is
@@ -2269,7 +2298,7 @@ template <class T> class optional<T &> {
   }
 #else
   /// \brief Carries out some operation on the stored object if there is one.
-  /// \returns Let `U` be the result of `std::invoke(std::forward<F>(f),
+  /// \return Let `U` be the result of `std::invoke(std::forward<F>(f),
   /// value())`. Returns a `std::optional<U>`. The return value is empty if
   /// `*this` is empty, otherwise an `optional<U>` is constructed from the
   /// return value of `std::invoke(std::forward<F>(f), value())` and is
@@ -2511,7 +2540,7 @@ template <class T> class optional<T &> {
   }
 #endif
 
-  /// \returns `u` if `*this` has a value, otherwise an empty optional.
+  /// \return `u` if `*this` has a value, otherwise an empty optional.
   __thrust_exec_check_disable__
   template <class U>
   __host__ __device__
@@ -2520,7 +2549,7 @@ template <class T> class optional<T &> {
     return has_value() ? result{u} : result{nullopt};
   }
 
-  /// \returns `rhs` if `*this` is empty, otherwise the current value.
+  /// \return `rhs` if `*this` is empty, otherwise the current value.
   /// \group disjunction
   __thrust_exec_check_disable__
   __host__ __device__
@@ -2737,7 +2766,7 @@ template <class T> class optional<T &> {
   __host__ __device__
   void swap(optional &rhs) noexcept { std::swap(m_value, rhs.m_value); }
 
-  /// \returns a pointer to the stored value
+  /// \return a pointer to the stored value
   /// \requires a value is stored
   /// \group pointer
   /// \synopsis constexpr const T *operator->() const;
@@ -2751,7 +2780,7 @@ template <class T> class optional<T &> {
   __host__ __device__
   THRUST_OPTIONAL_CPP11_CONSTEXPR T *operator->() { return m_value; }
 
-  /// \returns the stored value
+  /// \return the stored value
   /// \requires a value is stored
   /// \group deref
   /// \synopsis constexpr T &operator*();
@@ -2764,7 +2793,7 @@ template <class T> class optional<T &> {
   __host__ __device__
   constexpr const T &operator*() const { return *m_value; }
 
-  /// \returns whether or not the optional has a value
+  /// \return whether or not the optional has a value
   /// \group has_value
   __thrust_exec_check_disable__
   __host__ __device__
@@ -2777,7 +2806,7 @@ template <class T> class optional<T &> {
     return m_value != nullptr;
   }
 
-  /// \returns the contained value if there is one, otherwise throws
+  /// \return the contained value if there is one, otherwise throws
   /// [bad_optional_access]
   /// \group value
   /// synopsis constexpr T &value();
@@ -2796,7 +2825,7 @@ template <class T> class optional<T &> {
     throw bad_optional_access();
   }
 
-  /// \returns the stored value if there is one, otherwise returns `u`
+  /// \return the stored value if there is one, otherwise returns `u`
   /// \group value_or
   __thrust_exec_check_disable__
   template <class U>
@@ -2827,18 +2856,18 @@ template <class T> class optional<T &> {
   T *m_value;
 };
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 namespace std {
 // TODO SFINAE
-template <class T> struct hash<thrust::optional<T>> {
+template <class T> struct hash<THRUST_NS_QUALIFIER::optional<T>> {
   __thrust_exec_check_disable__
   __host__ __device__
-  ::std::size_t operator()(const thrust::optional<T> &o) const {
+  ::std::size_t operator()(const THRUST_NS_QUALIFIER::optional<T> &o) const {
     if (!o.has_value())
       return 0;
 
-    return std::hash<thrust::detail::remove_const_t<T>>()(*o);
+    return std::hash<THRUST_NS_QUALIFIER::detail::remove_const_t<T>>()(*o);
   }
 };
 } // namespace std
diff --git a/thrust/pair.h b/thrust/pair.h
index 48da892c7..eb2138aaf 100644
--- a/thrust/pair.h
+++ b/thrust/pair.h
@@ -23,8 +23,7 @@
 #include <thrust/detail/config.h>
 #include <utility>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup utility
  *  \{
@@ -119,8 +118,8 @@ template <typename T1, typename T2>
  *  \param y The second \p pair to compare.
  *  \return \c true if and only if <tt>x.first == y.first && x.second == y.second</tt>.
  *  
- *  \tparam T1 is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam T2 is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *  \tparam T1 is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam T2 is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
  */
 template <typename T1, typename T2>
   inline __host__ __device__
@@ -133,8 +132,8 @@ template <typename T1, typename T2>
  *  \param y The second \p pair to compare.
  *  \return \c true if and only if <tt>x.first < y.first || (!(y.first < x.first) && x.second < y.second)</tt>.
  *
- *  \tparam T1 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
- *  \tparam T2 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T1 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
+ *  \tparam T2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  */
 template <typename T1, typename T2>
   inline __host__ __device__
@@ -147,8 +146,8 @@ template <typename T1, typename T2>
  *  \param y The second \p pair to compare.
  *  \return \c true if and only if <tt>!(x == y)</tt>.
  *
- *  \tparam T1 is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam T2 is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *  \tparam T1 is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam T2 is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
  */
 template <typename T1, typename T2>
   inline __host__ __device__
@@ -161,8 +160,8 @@ template <typename T1, typename T2>
  *  \param y The second \p pair to compare.
  *  \return \c true if and only if <tt>y < x</tt>.
  *
- *  \tparam T1 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
- *  \tparam T2 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T1 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
+ *  \tparam T2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  */
 template <typename T1, typename T2>
   inline __host__ __device__
@@ -175,8 +174,8 @@ template <typename T1, typename T2>
  *  \param y The second \p pair to compare.
  *  \return \c true if and only if <tt>!(y < x)</tt>.
  *
- *  \tparam T1 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
- *  \tparam T2 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T1 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
+ *  \tparam T2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  */
 template <typename T1, typename T2>
   inline __host__ __device__
@@ -189,8 +188,8 @@ template <typename T1, typename T2>
  *  \param y The second \p pair to compare.
  *  \return \c true if and only if <tt>!(x < y)</tt>.
  *
- *  \tparam T1 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
- *  \tparam T2 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T1 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
+ *  \tparam T2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  */
 template <typename T1, typename T2>
   inline __host__ __device__
@@ -228,7 +227,7 @@ template <typename T1, typename T2>
  *  \tparam N This parameter selects the member of interest.
  *  \tparam T A \c pair type of interest.
  */
-template<int N, typename T> struct tuple_element;
+template<size_t N, class T> struct tuple_element;
 
 
 /*! This convenience metafunction is included for compatibility with
@@ -277,7 +276,6 @@ template<typename Pair> struct tuple_size;
 /*! \} // utility
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/pair.inl>
-
diff --git a/thrust/partition.h b/thrust/partition.h
index 3c493e088..90768f246 100644
--- a/thrust/partition.h
+++ b/thrust/partition.h
@@ -25,9 +25,7 @@
 #include <thrust/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup reordering
  *  \ingroup algorithms
@@ -61,10 +59,10 @@ namespace thrust
  *          the sequence of the elements which do not satisfy \p pred.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type,
  *          and \p ForwardIterator is mutable.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p partition to reorder a
  *  sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization:
@@ -90,7 +88,7 @@ namespace thrust
  *  // A is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partition.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partition
  *  \see \p stable_partition
  *  \see \p partition_copy
  */
@@ -123,10 +121,10 @@ __host__ __device__
  *  \return An iterator referring to the first element of the second partition, that is,
  *          the sequence of the elements which do not satisfy \p pred.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type,
  *          and \p ForwardIterator is mutable.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p partition to reorder a
  *  sequence so that even numbers precede odd numbers.
@@ -150,7 +148,7 @@ __host__ __device__
  *  // A is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partition.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partition
  *  \see \p stable_partition
  *  \see \p partition_copy
  */
@@ -186,11 +184,11 @@ template<typename ForwardIterator,
  *          the sequence of the elements whose stencil elements do not satisfy \p pred.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[stencil, stencil + (last - first))</tt> shall not overlap.
  *
@@ -218,7 +216,7 @@ template<typename ForwardIterator,
  *  // S is unmodified
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partition.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partition
  *  \see \p stable_partition
  *  \see \p partition_copy
  */
@@ -255,11 +253,11 @@ __host__ __device__
  *  \return An iterator referring to the first element of the second partition, that is,
  *          the sequence of the elements whose stencil elements do not satisfy \p pred.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[stencil, stencil + (last - first))</tt> shall not overlap.
  *
@@ -286,7 +284,7 @@ __host__ __device__
  *  // S is unmodified
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partition.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partition
  *  \see \p stable_partition
  *  \see \p partition_copy
  */
@@ -321,12 +319,12 @@ template<typename ForwardIterator,
  *          \p out_false.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type and \p InputIterator's \c value_type
  *          is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The input range shall not overlap with either output range.
  *
@@ -399,12 +397,12 @@ __host__ __device__
  *          at \p out_true and <tt>p.second</tt> is the end of the output range beginning at
  *          \p out_false.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type and \p InputIterator's \c value_type
  *          is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The input range shall not overlap with either output range.
  *
@@ -479,13 +477,13 @@ template<typename InputIterator,
  *          \p out_false.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The input ranges shall not overlap with either output range.
  *
@@ -557,13 +555,13 @@ __host__ __device__
  *          at \p out_true and <tt>p.second</tt> is the end of the output range beginning at
  *          \p out_false.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The input ranges shall not overlap with either output range.
  *
@@ -636,10 +634,10 @@ template<typename InputIterator1,
  *          the sequence of the elements which do not satisfy pred.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type,
  *          and \p ForwardIterator is mutable.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p stable_partition to reorder a
  *  sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization:
@@ -665,7 +663,7 @@ template<typename InputIterator1,
  *  // A is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/stable_partition.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/stable_partition
  *  \see \p partition
  *  \see \p stable_partition_copy
  */
@@ -701,10 +699,10 @@ __host__ __device__
  *  \return An iterator referring to the first element of the second partition, that is,
  *          the sequence of the elements which do not satisfy pred.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type,
  *          and \p ForwardIterator is mutable.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p stable_partition to reorder a
  *  sequence so that even numbers precede odd numbers.
@@ -728,7 +726,7 @@ __host__ __device__
  *  // A is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/stable_partition.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/stable_partition
  *  \see \p partition
  *  \see \p stable_partition_copy
  */
@@ -766,11 +764,11 @@ template<typename ForwardIterator,
  *          the sequence of the elements whose stencil elements do not satisfy \p pred.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[first, last)</tt> shall not overlap with the range <tt>[stencil, stencil + (last - first))</tt>.
  *
@@ -798,7 +796,7 @@ template<typename ForwardIterator,
  *  // S is unmodified
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/stable_partition.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/stable_partition
  *  \see \p partition
  *  \see \p stable_partition_copy
  */
@@ -837,11 +835,11 @@ __host__ __device__
  *  \return An iterator referring to the first element of the second partition, that is,
  *          the sequence of the elements whose stencil elements do not satisfy \p pred.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[first, last)</tt> shall not overlap with the range <tt>[stencil, stencil + (last - first))</tt>.
  *
@@ -868,7 +866,7 @@ __host__ __device__
  *  // S is unmodified
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/stable_partition.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/stable_partition
  *  \see \p partition
  *  \see \p stable_partition_copy
  */
@@ -909,12 +907,12 @@ template<typename ForwardIterator,
  *          \p out_false.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type and \p InputIterator's \c value_type
  *          is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The input ranges shall not overlap with either output range.
  *
@@ -989,12 +987,12 @@ __host__ __device__
  *          at \p out_true and <tt>p.second</tt> is the end of the output range beginning at
  *          \p out_false.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type and \p InputIterator's \c value_type
  *          is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The input ranges shall not overlap with either output range.
  *
@@ -1071,13 +1069,13 @@ template<typename InputIterator,
  *          \p out_false.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The input ranges shall not overlap with either output range.
  *
@@ -1150,13 +1148,13 @@ __host__ __device__
  *          at \p out_true and <tt>p.second</tt> is the end of the output range beginning at
  *          \p out_false.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The input ranges shall not overlap with either output range.
  *
@@ -1226,9 +1224,9 @@ template<typename InputIterator1,
  *          and <tt>none_of(mid, last, pred)</tt> are both true.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[first, last)</tt> shall be partitioned by \p pred.
  *
@@ -1279,9 +1277,9 @@ __host__ __device__
  *  \return An iterator \c mid such that <tt>all_of(first, mid, pred)</tt>
  *          and <tt>none_of(mid, last, pred)</tt> are both true.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[first, last)</tt> shall be partitioned by \p pred.
  *
@@ -1345,9 +1343,9 @@ template<typename ForwardIterator, typename Predicate>
  *          to \p pred, or if <tt>[first, last)</tt> is empty. \c false, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *  
  *  \code
  *  #include <thrust/partition.h>
@@ -1395,9 +1393,9 @@ __host__ __device__
  *  \return \c true if the range <tt>[first, last)</tt> is partitioned with respect
  *          to \p pred, or if <tt>[first, last)</tt> is empty. \c false, otherwise.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *  
  *  \code
  *  #include <thrust/partition.h>
@@ -1432,8 +1430,7 @@ template<typename InputIterator, typename Predicate>
  *  \} // end reductions
  */
 
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/partition.inl>
 
diff --git a/thrust/per_device_resource.h b/thrust/per_device_resource.h
index 91d4d9a0d..a6d620f85 100644
--- a/thrust/per_device_resource.h
+++ b/thrust/per_device_resource.h
@@ -28,13 +28,13 @@
 #include <thrust/detail/execution_policy.h>
 #include <thrust/mr/allocator.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 /*! Returns a global instance of \p MR for the current device of the provided system.
  *
  *  \tparam MR type of a memory resource to get an instance from. Must be \p DefaultConstructible.
  *  \param system execution policy for which the resource is requested.
- *  \returns a pointer to a global instance of \p MR for the current device.
+ *  \return a pointer to a global instance of \p MR for the current device.
  */
 template<typename MR, typename DerivedPolicy>
 __host__
@@ -97,7 +97,6 @@ class per_device_allocator : public thrust::mr::allocator<T, Upstream>
     ~per_device_allocator() {}
 };
 
-
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #endif // THRUST_CPP_DIALECT >= 2011
diff --git a/thrust/random.h b/thrust/random.h
index c0e9e2282..7463620b7 100644
--- a/thrust/random.h
+++ b/thrust/random.h
@@ -35,9 +35,7 @@
 #include <thrust/random/uniform_real_distribution.h>
 #include <thrust/random/normal_distribution.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup random Random Number Generation
  *  \{
@@ -116,5 +114,4 @@ using random::ranlux48;
 using random::taus88;
 using random::default_random_engine;
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/random/detail/discard_block_engine.inl b/thrust/random/detail/discard_block_engine.inl
index fca16c2bf..31128e250 100644
--- a/thrust/random/detail/discard_block_engine.inl
+++ b/thrust/random/detail/discard_block_engine.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,10 +14,13 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/random/discard_block_engine.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -208,5 +211,5 @@ bool operator!=(const discard_block_engine<Engine,p,r> &lhs,
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/linear_congruential_engine.inl b/thrust/random/detail/linear_congruential_engine.inl
index da0b03e15..fa9fd7d0d 100644
--- a/thrust/random/detail/linear_congruential_engine.inl
+++ b/thrust/random/detail/linear_congruential_engine.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,12 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/random/linear_congruential_engine.h>
 #include <thrust/random/detail/mod.h>
 #include <thrust/random/detail/random_core_access.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -165,5 +168,5 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/linear_congruential_engine_discard.h b/thrust/random/detail/linear_congruential_engine_discard.h
index 381595144..c8103d9dc 100644
--- a/thrust/random/detail/linear_congruential_engine_discard.h
+++ b/thrust/random/detail/linear_congruential_engine_discard.h
@@ -16,11 +16,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/cstdint.h>
 #include <thrust/random/detail/mod.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -103,5 +104,5 @@ struct linear_congruential_engine_discard
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/linear_feedback_shift_engine.inl b/thrust/random/detail/linear_feedback_shift_engine.inl
index b5d55be15..ac3ca8673 100644
--- a/thrust/random/detail/linear_feedback_shift_engine.inl
+++ b/thrust/random/detail/linear_feedback_shift_engine.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,10 +14,13 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/random/linear_feedback_shift_engine.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -161,5 +164,5 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/linear_feedback_shift_engine_wordmask.h b/thrust/random/detail/linear_feedback_shift_engine_wordmask.h
index 6669350ea..73c8ae83e 100644
--- a/thrust/random/detail/linear_feedback_shift_engine_wordmask.h
+++ b/thrust/random/detail/linear_feedback_shift_engine_wordmask.h
@@ -16,8 +16,9 @@
 
 #pragma once
 
-namespace thrust
-{
+#include <thrust/detail/config.h>
+
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -43,5 +44,5 @@ template<typename T, int w>
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/mod.h b/thrust/random/detail/mod.h
index ed6afcf03..f0637582d 100644
--- a/thrust/random/detail/mod.h
+++ b/thrust/random/detail/mod.h
@@ -16,8 +16,9 @@
 
 #pragma once
 
-namespace thrust
-{
+#include <thrust/detail/config.h>
+
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -34,7 +35,7 @@ template<typename T, T a, T c, T m, bool = (m == 0)>
   __host__ __device__
   T operator()(T x) const
   {
-    if(a == 1)
+    THRUST_IF_CONSTEXPR(a == 1)
     {
       x %= m;
     }
@@ -52,7 +53,7 @@ template<typename T, T a, T c, T m, bool = (m == 0)>
       }
     }
 
-    if(c != 0)
+    THRUST_IF_CONSTEXPR(c != 0)
     {
       const T d = m - x;
       if(d > c)
@@ -93,5 +94,5 @@ __host__ __device__
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/normal_distribution.inl b/thrust/random/detail/normal_distribution.inl
index 099a977f3..4b69bab21 100644
--- a/thrust/random/detail/normal_distribution.inl
+++ b/thrust/random/detail/normal_distribution.inl
@@ -1,6 +1,5 @@
 /*
- *
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -15,6 +14,10 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/random/normal_distribution.h>
 #include <thrust/random/uniform_real_distribution.h>
 #include <thrust/detail/cstdint.h>
@@ -27,8 +30,7 @@
 #include <limits>
 #endif
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -251,5 +253,5 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/normal_distribution_base.h b/thrust/random/detail/normal_distribution_base.h
index 6c11af62b..a42e80014 100644
--- a/thrust/random/detail/normal_distribution_base.h
+++ b/thrust/random/detail/normal_distribution_base.h
@@ -29,14 +29,13 @@
 #include <limits>
 #include <cmath>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace random
 {
 namespace detail
 {
 
-// this version samples the normal distribution directly 
+// this version samples the normal distribution directly
 // and uses the non-standard math function erfcinv
 template<typename RealType>
   class normal_distribution_nvcc
@@ -46,15 +45,15 @@ template<typename RealType>
     __host__ __device__
     RealType sample(UniformRandomNumberGenerator &urng, const RealType mean, const RealType stddev)
     {
-      typedef typename UniformRandomNumberGenerator::result_type uint_type;
-      const uint_type urng_range = UniformRandomNumberGenerator::max - UniformRandomNumberGenerator::min;
+      using uint_type = typename UniformRandomNumberGenerator::result_type;
+      constexpr uint_type urng_range = UniformRandomNumberGenerator::max - UniformRandomNumberGenerator::min;
 
       // Constants for conversion
-      const RealType S1 = static_cast<RealType>(1) / urng_range;
-      const RealType S2 = S1 / 2;
+      constexpr RealType S1 = static_cast<RealType>(1. / static_cast<double>(urng_range));
+      constexpr RealType S2 = S1 / 2;
 
       RealType S3 = static_cast<RealType>(-1.4142135623730950488016887242097); // -sqrt(2)
-      
+
       // Get the integer value
       uint_type u = urng() - UniformRandomNumberGenerator::min;
 
@@ -77,7 +76,7 @@ template<typename RealType>
     void reset() {}
 };
 
-// this version samples the normal distribution using 
+// this version samples the normal distribution using
 // Marsaglia's "polar method"
 template<typename RealType>
   class normal_distribution_portable
@@ -136,7 +135,7 @@ template<typename RealType>
 template<typename RealType>
   struct normal_distribution_base
 {
-#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC && !defined(_NVHPC_CUDA)
   typedef normal_distribution_nvcc<RealType> type;
 #else
   typedef normal_distribution_portable<RealType> type;
@@ -145,5 +144,5 @@ template<typename RealType>
 
 } // end detail
 } // end random
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/random_core_access.h b/thrust/random/detail/random_core_access.h
index f03060e0a..a3e34e02b 100644
--- a/thrust/random/detail/random_core_access.h
+++ b/thrust/random/detail/random_core_access.h
@@ -16,8 +16,9 @@
 
 #pragma once
 
-namespace thrust
-{
+#include <thrust/detail/config.h>
+
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -53,5 +54,5 @@ static bool equal(const EngineOrDistribution &lhs, const EngineOrDistribution &r
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/subtract_with_carry_engine.inl b/thrust/random/detail/subtract_with_carry_engine.inl
index 9b4a4c45c..21c22fe77 100644
--- a/thrust/random/detail/subtract_with_carry_engine.inl
+++ b/thrust/random/detail/subtract_with_carry_engine.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,13 +14,16 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/random/linear_congruential_engine.h>
 #include <thrust/random/subtract_with_carry_engine.h>
 #include <thrust/random/detail/mod.h>
 #include <thrust/random/detail/random_core_access.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -105,19 +108,19 @@ template<typename UIntType, size_t w, size_t s, size_t r>
 {
   typedef std::basic_ostream<CharT,Traits> ostream_type;
   typedef typename ostream_type::ios_base     ios_base;
-                  
+
   const typename ios_base::fmtflags flags = os.flags();
   const CharT fill  = os.fill();
   const CharT space = os.widen(' ');
   os.flags(ios_base::dec | ios_base::fixed | ios_base::left);
   os.fill(space);
 
-  const UIntType long_lag = r;
-                                                          
+  const UIntType long_lag_ = r;
+
   for(size_t i = 0; i < r; ++i)
-    os << m_x[(i + m_k) % long_lag] << space;
+    os << m_x[(i + m_k) % long_lag_] << space;
   os << m_carry;
-                                                                          
+
   os.flags(flags);
   os.fill(fill);
   return os;
@@ -151,12 +154,12 @@ template<typename UIntType, size_t w, size_t s, size_t r>
   bool subtract_with_carry_engine<UIntType,w,s,r>
     ::equal(const subtract_with_carry_engine<UIntType,w,s,r> &rhs) const
 {
-  const UIntType long_lag = r;
+  const UIntType long_lag_ = r;
 
   bool result = true;
   for(size_t i = 0; i < r; ++i)
   {
-    result &= (m_x[(i + m_k) % long_lag] == rhs.m_x[(i + rhs.m_k) % long_lag]);
+    result &= (m_x[(i + m_k) % long_lag_] == rhs.m_x[(i + rhs.m_k) % long_lag_]);
   }
 
   // XXX not sure if this last check is necessary
@@ -206,5 +209,5 @@ template<typename UIntType, size_t w, size_t s, size_t r>
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/uniform_int_distribution.inl b/thrust/random/detail/uniform_int_distribution.inl
index 18eb5194c..064bfcc73 100644
--- a/thrust/random/detail/uniform_int_distribution.inl
+++ b/thrust/random/detail/uniform_int_distribution.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,12 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/random/uniform_int_distribution.h>
 #include <thrust/random/uniform_real_distribution.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -242,5 +245,5 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/uniform_real_distribution.inl b/thrust/random/detail/uniform_real_distribution.inl
index ec4f21e9e..119f82c1e 100644
--- a/thrust/random/detail/uniform_real_distribution.inl
+++ b/thrust/random/detail/uniform_real_distribution.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,10 +14,13 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/random/uniform_real_distribution.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -227,5 +230,5 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/xor_combine_engine.inl b/thrust/random/detail/xor_combine_engine.inl
index d24865f68..c94821443 100644
--- a/thrust/random/detail/xor_combine_engine.inl
+++ b/thrust/random/detail/xor_combine_engine.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,11 +14,14 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/random/xor_combine_engine.h>
 #include <thrust/random/detail/random_core_access.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -211,5 +214,5 @@ bool operator!=(const xor_combine_engine<Engine1,s1,Engine2,s2> &lhs,
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/xor_combine_engine_max.h b/thrust/random/detail/xor_combine_engine_max.h
index cfb5bdc83..0756ff9e0 100644
--- a/thrust/random/detail/xor_combine_engine_max.h
+++ b/thrust/random/detail/xor_combine_engine_max.h
@@ -16,13 +16,14 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/mpl/math.h>
 #include <limits>
 #include <cstddef>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -320,5 +321,5 @@ template<typename Engine1, size_t s1, typename Engine2, size_t s2, typename resu
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/discard_block_engine.h b/thrust/random/discard_block_engine.h
index 2d73649c2..88e115586 100644
--- a/thrust/random/discard_block_engine.h
+++ b/thrust/random/discard_block_engine.h
@@ -29,8 +29,7 @@
 #include <thrust/detail/cstdint.h>
 #include <thrust/random/detail/random_core_access.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -246,7 +245,7 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 // import names into thrust::
 using random::discard_block_engine;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/random/detail/discard_block_engine.inl>
 
diff --git a/thrust/random/linear_congruential_engine.h b/thrust/random/linear_congruential_engine.h
index 0dc72b3b1..dac03d90e 100644
--- a/thrust/random/linear_congruential_engine.h
+++ b/thrust/random/linear_congruential_engine.h
@@ -27,8 +27,7 @@
 #include <thrust/random/detail/random_core_access.h>
 #include <thrust/random/detail/linear_congruential_engine_discard.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -289,7 +288,7 @@ using random::linear_congruential_engine;
 using random::minstd_rand;
 using random::minstd_rand0;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/random/detail/linear_congruential_engine.inl>
 
diff --git a/thrust/random/linear_feedback_shift_engine.h b/thrust/random/linear_feedback_shift_engine.h
index 90c572c9b..a46c6d8ab 100644
--- a/thrust/random/linear_feedback_shift_engine.h
+++ b/thrust/random/linear_feedback_shift_engine.h
@@ -35,8 +35,7 @@
 #include <cstddef> // for size_t
 #include <thrust/random/detail/random_core_access.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 namespace random
@@ -224,7 +223,7 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 // import names into thrust::
 using random::linear_feedback_shift_engine;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/random/detail/linear_feedback_shift_engine.inl>
 
diff --git a/thrust/random/normal_distribution.h b/thrust/random/normal_distribution.h
index ac45e161a..36b985cb6 100644
--- a/thrust/random/normal_distribution.h
+++ b/thrust/random/normal_distribution.h
@@ -27,8 +27,7 @@
 #include <thrust/random/detail/normal_distribution_base.h>
 #include <iostream>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -269,7 +268,7 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 
 using random::normal_distribution;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/random/detail/normal_distribution.inl>
 
diff --git a/thrust/random/subtract_with_carry_engine.h b/thrust/random/subtract_with_carry_engine.h
index 0b12ca353..69ee841fd 100644
--- a/thrust/random/subtract_with_carry_engine.h
+++ b/thrust/random/subtract_with_carry_engine.h
@@ -28,8 +28,7 @@
 #include <cstddef> // for size_t
 #include <iostream>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -250,7 +249,7 @@ using random::subtract_with_carry_engine;
 using random::ranlux24_base;
 using random::ranlux48_base;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/random/detail/subtract_with_carry_engine.inl>
 
diff --git a/thrust/random/uniform_int_distribution.h b/thrust/random/uniform_int_distribution.h
index 42d745781..18f369fc2 100644
--- a/thrust/random/uniform_int_distribution.h
+++ b/thrust/random/uniform_int_distribution.h
@@ -27,8 +27,7 @@
 #include <thrust/random/detail/random_core_access.h>
 #include <iostream>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -110,7 +109,8 @@ template<typename IntType = int>
      *           the platform.
      */
     __host__ __device__
-    explicit uniform_int_distribution(IntType a = 0, IntType b = thrust::detail::integer_traits<IntType>::const_max);
+    explicit uniform_int_distribution(IntType a = 0,
+                                      IntType b = THRUST_NS_QUALIFIER::detail::integer_traits<IntType>::const_max);
 
     /*! This constructor creates a new \p uniform_int_distribution from a \p param_type object
      *  encapsulating the range of the distribution.
@@ -270,7 +270,7 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 
 using random::uniform_int_distribution;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/random/detail/uniform_int_distribution.inl>
 
diff --git a/thrust/random/uniform_real_distribution.h b/thrust/random/uniform_real_distribution.h
index 312104570..e6c5a7d88 100644
--- a/thrust/random/uniform_real_distribution.h
+++ b/thrust/random/uniform_real_distribution.h
@@ -26,8 +26,7 @@
 #include <thrust/random/detail/random_core_access.h>
 #include <iostream>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -268,7 +267,7 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 
 using random::uniform_real_distribution;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/random/detail/uniform_real_distribution.inl>
 
diff --git a/thrust/random/xor_combine_engine.h b/thrust/random/xor_combine_engine.h
index d5e86b7a9..321f04033 100644
--- a/thrust/random/xor_combine_engine.h
+++ b/thrust/random/xor_combine_engine.h
@@ -29,8 +29,7 @@
 #include <iostream>
 #include <cstddef> // for size_t
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -265,7 +264,7 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 // import names into thrust::
 using random::xor_combine_engine;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/random/detail/xor_combine_engine.inl>
 
diff --git a/thrust/reduce.h b/thrust/reduce.h
index cabb83c37..c7b378f72 100644
--- a/thrust/reduce.h
+++ b/thrust/reduce.h
@@ -26,9 +26,7 @@
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup reductions
  *  \{
@@ -58,7 +56,7 @@ namespace thrust
  *  \return The result of the reduction.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *          and if \c x and \c y are objects of \p InputIterator's \c value_type,
  *          then <tt>x + y</tt> is defined and is convertible to \p InputIterator's
  *          \c value_type. If \c T is \c InputIterator's \c value_type, then
@@ -77,7 +75,7 @@ namespace thrust
  *  // result == 9
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/accumulate.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/accumulate
  */
 template<typename DerivedPolicy, typename InputIterator>
 __host__ __device__
@@ -104,7 +102,7 @@ __host__ __device__
  *  \param last The end of the sequence.
  *  \return The result of the reduction.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *          and if \c x and \c y are objects of \p InputIterator's \c value_type,
  *          then <tt>x + y</tt> is defined and is convertible to \p InputIterator's
  *          \c value_type. If \c T is \c InputIterator's \c value_type, then
@@ -122,7 +120,7 @@ __host__ __device__
  *  // result == 9
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/accumulate.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/accumulate
  */
 template<typename InputIterator> typename
   thrust::iterator_traits<InputIterator>::value_type reduce(InputIterator first, InputIterator last);
@@ -152,7 +150,7 @@ template<typename InputIterator> typename
  *  \return The result of the reduction.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *          and if \c x and \c y are objects of \p InputIterator's \c value_type,
  *          then <tt>x + y</tt> is defined and is convertible to \p T.
  *  \tparam T is convertible to \p InputIterator's \c value_type.
@@ -171,7 +169,7 @@ template<typename InputIterator> typename
  *  // result == 10
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/accumulate.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/accumulate
  */
 template<typename DerivedPolicy, typename InputIterator, typename T>
 __host__ __device__
@@ -201,7 +199,7 @@ __host__ __device__
  *  \param init The initial value.
  *  \return The result of the reduction.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *          and if \c x and \c y are objects of \p InputIterator's \c value_type,
  *          then <tt>x + y</tt> is defined and is convertible to \p T.
  *  \tparam T is convertible to \p InputIterator's \c value_type.
@@ -218,7 +216,7 @@ __host__ __device__
  *  // result == 10
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/accumulate.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/accumulate
  */
 template<typename InputIterator, typename T>
   T reduce(InputIterator first,
@@ -251,11 +249,11 @@ template<typename InputIterator, typename T>
  *  \return The result of the reduction.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *          and \c InputIterator's \c value_type is convertible to \c T.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type.
- *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *  \tparam BinaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>,
  *          and \p BinaryFunction's \c result_type is convertible to \p OutputType.
  *
  *  The following code snippet demonstrates how to use \p reduce to
@@ -275,7 +273,7 @@ template<typename InputIterator, typename T>
  *  // result == 3
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/accumulate.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/accumulate
  *  \see transform_reduce
  */
 template<typename DerivedPolicy,
@@ -311,11 +309,11 @@ __host__ __device__
  *  \param binary_op The binary function used to 'sum' values.
  *  \return The result of the reduction.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *          and \c InputIterator's \c value_type is convertible to \c T.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type.
- *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *  \tparam BinaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>,
  *          and \p BinaryFunction's \c result_type is convertible to \p OutputType.
  *
  *  The following code snippet demonstrates how to use \p reduce to
@@ -332,7 +330,7 @@ __host__ __device__
  *  // result == 3
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/accumulate.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/accumulate
  *  \see transform_reduce
  */
 template<typename InputIterator,
@@ -364,11 +362,11 @@ template<typename InputIterator,
  *  \return A pair of iterators at end of the ranges <tt>[keys_output, keys_output_last)</tt> and <tt>[values_output, values_output_last)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
  *
  *  \pre The input ranges shall not overlap either output range.
@@ -430,11 +428,11 @@ __host__ __device__
  *  \param values_output The beginning of the output value range.
  *  \return A pair of iterators at end of the ranges <tt>[keys_output, keys_output_last)</tt> and <tt>[values_output, values_output_last)</tt>.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
  *
  *  \pre The input ranges shall not overlap either output range.
@@ -496,13 +494,13 @@ template<typename InputIterator1,
  *  \return A pair of iterators at end of the ranges <tt>[keys_output, keys_output_last)</tt> and <tt>[values_output, values_output_last)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  \pre The input ranges shall not overlap either output range.
  *
@@ -567,13 +565,13 @@ __host__ __device__
  *  \param binary_pred  The binary predicate used to determine equality.
  *  \return A pair of iterators at end of the ranges <tt>[keys_output, keys_output_last)</tt> and <tt>[values_output, values_output_last)</tt>.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  \pre The input ranges shall not overlap either output range.
  *
@@ -641,14 +639,14 @@ template<typename InputIterator1,
  *  \return A pair of iterators at end of the ranges <tt>[keys_output, keys_output_last)</tt> and <tt>[values_output, values_output_last)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
- *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
+ *  \tparam BinaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *          and \c BinaryFunction's \c result_type is convertible to \c OutputIterator2's \c value_type.
  *
  *  \pre The input ranges shall not overlap either output range.
@@ -721,14 +719,14 @@ __host__ __device__
  *  \param binary_op The binary function used to accumulate values.
  *  \return A pair of iterators at end of the ranges <tt>[keys_output, keys_output_last)</tt> and <tt>[values_output, values_output_last)</tt>.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
- *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
+ *  \tparam BinaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *          and \c BinaryFunction's \c result_type is convertible to \c OutputIterator2's \c value_type.
  *
  *  \pre The input ranges shall not overlap either output range.
@@ -778,8 +776,6 @@ template<typename InputIterator1,
 /*! \} // end reductions
  */
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/reduce.inl>
-
diff --git a/thrust/remove.h b/thrust/remove.h
index 7e8ec41a6..a57fcf211 100644
--- a/thrust/remove.h
+++ b/thrust/remove.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup stream_compaction Stream Compaction
  *  \ingroup reordering
@@ -54,9 +52,9 @@ namespace thrust
  *          elements which are not equal to \p value.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>,
  *          and objects of type \p T can be compared for equality with objects of \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p remove to remove a number
@@ -82,12 +80,12 @@ namespace thrust
  *  range after elements have been removed from it; it follows that the elements
  *  after that iterator are of no interest, and may be discarded. If you are
  *  removing elements from a
- *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a>, you may
+ *  <a href="https://en.cppreference.com/w/cpp/container">Sequence</a>, you may
  *  simply erase them. That is, a reasonable way of removing elements from a
- *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a> is
+ *  <a href="https://en.cppreference.com/w/cpp/container">Sequence</a> is
  *  <tt>S.erase(remove(S.begin(), S.end(), x), S.end())</tt>.
  *
- *  \see http://www.sgi.com/tech/stl/remove.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove
  *  \see remove_if
  *  \see remove_copy
  *  \see remove_copy_if
@@ -117,9 +115,9 @@ __host__ __device__
  *  \return A \p ForwardIterator pointing to the end of the resulting range of
  *          elements which are not equal to \p value.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>,
  *          and objects of type \p T can be compared for equality with objects of \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p remove to remove a number
@@ -144,12 +142,12 @@ __host__ __device__
  *  range after elements have been removed from it; it follows that the elements
  *  after that iterator are of no interest, and may be discarded. If you are
  *  removing elements from a
- *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a>, you may
+ *  <a href="https://en.cppreference.com/w/cpp/container">Sequence</a>, you may
  *  simply erase them. That is, a reasonable way of removing elements from a
- *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a> is
+ *  <a href="https://en.cppreference.com/w/cpp/container">Sequence</a> is
  *  <tt>S.erase(remove(S.begin(), S.end(), x), S.end())</tt>.
  *
- *  \see http://www.sgi.com/tech/stl/remove.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove
  *  \see remove_if
  *  \see remove_copy
  *  \see remove_copy_if
@@ -179,10 +177,10 @@ template<typename ForwardIterator,
  *          which are not equal to \p value.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable">Equality Comparable</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>,
  *          and objects of type \p T can be compared for equality with objects of \p InputIterator's \c value_type.
  *
  *  \pre The range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
@@ -203,7 +201,7 @@ template<typename ForwardIterator,
  *  // result is now {-2, -1, 1, 2}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/remove_copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove_copy
  *  \see remove
  *  \see remove_if
  *  \see remove_copy_if
@@ -234,10 +232,10 @@ __host__ __device__
  *  \return An OutputIterator pointing to the end of the resulting range of elements
  *          which are not equal to \p value.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable">Equality Comparable</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>,
  *          and objects of type \p T can be compared for equality with objects of \p InputIterator's \c value_type.
  *
  *  \pre The range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
@@ -256,7 +254,7 @@ __host__ __device__
  *  // result is now {-2, -1, 1, 2}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/remove_copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove_copy
  *  \see remove
  *  \see remove_if
  *  \see remove_copy_if
@@ -290,10 +288,10 @@ template<typename InputIterator,
  *          elements for which \p pred evaluated to \c true.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable,
  *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p remove_if to remove
  *  all even numbers from an array of integers using the \p thrust::host execution policy for
@@ -329,12 +327,12 @@ template<typename InputIterator,
  *  range after elements have been removed from it; it follows that the elements
  *  after that iterator are of no interest, and may be discarded. If you are
  *  removing elements from a
- *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a>, you may
+ *  <a href="https://en.cppreference.com/w/cpp/container">Sequence</a>, you may
  *  simply erase them. That is, a reasonable way of removing elements from a
- *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a> is
+ *  <a href="https://en.cppreference.com/w/cpp/container">Sequence</a> is
  *  <tt>S.erase(remove_if(S.begin(), S.end(), pred), S.end())</tt>.
  *
- *  \see http://www.sgi.com/tech/stl/remove_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove
  *  \see remove
  *  \see remove_copy
  *  \see remove_copy_if
@@ -365,10 +363,10 @@ __host__ __device__
  *  \return A ForwardIterator pointing to the end of the resulting range of
  *          elements for which \p pred evaluated to \c true.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable,
  *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p remove_if to remove
  *  all even numbers from an array of integers.
@@ -402,12 +400,12 @@ __host__ __device__
  *  range after elements have been removed from it; it follows that the elements
  *  after that iterator are of no interest, and may be discarded. If you are
  *  removing elements from a
- *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a>, you may
+ *  <a href="https://en.cppreference.com/w/cpp/container">Sequence</a>, you may
  *  simply erase them. That is, a reasonable way of removing elements from a
- *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a> is
+ *  <a href="https://en.cppreference.com/w/cpp/container">Sequence</a> is
  *  <tt>S.erase(remove_if(S.begin(), S.end(), pred), S.end())</tt>.
  *
- *  \see http://www.sgi.com/tech/stl/remove_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove
  *  \see remove
  *  \see remove_copy
  *  \see remove_copy_if
@@ -438,11 +436,11 @@ template<typename ForwardIterator,
  *  \return An OutputIterator pointing to the end of the resulting range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
  *
@@ -471,7 +469,7 @@ template<typename ForwardIterator,
  *  // result is now {-1, 1}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/remove_copy_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove_copy
  *  \see remove
  *  \see remove_copy
  *  \see remove_if
@@ -503,11 +501,11 @@ __host__ __device__
  *              to the resulting sequence.
  *  \return An OutputIterator pointing to the end of the resulting range.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
  *
@@ -534,7 +532,7 @@ __host__ __device__
  *  // result is now {-1, 1}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/remove_copy_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove_copy
  *  \see remove
  *  \see remove_copy
  *  \see remove_if
@@ -569,11 +567,11 @@ template<typename InputIterator,
  *          elements for which \p pred evaluated to \c true.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/ForwardIterator.html">Forward Iterator</a>
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>
  *          and \p ForwardIterator is mutable.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
  *  \pre The range <tt>[stencil, stencil + (last - first))</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
@@ -597,7 +595,7 @@ template<typename InputIterator,
  *
  *  \note The range <tt>[first, last)</tt> is not permitted to overlap with the range <tt>[stencil, stencil + (last - first))</tt>.
  *
- *  \see http://www.sgi.com/tech/stl/remove_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove
  *  \see remove
  *  \see remove_copy
  *  \see remove_copy_if
@@ -631,11 +629,11 @@ __host__ __device__
  *  \return A ForwardIterator pointing to the end of the resulting range of
  *          elements for which \p pred evaluated to \c true.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/ForwardIterator.html">Forward Iterator</a>
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>
  *          and \p ForwardIterator is mutable.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
  *  \pre The range <tt>[stencil, stencil + (last - first))</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
@@ -657,7 +655,7 @@ __host__ __device__
  *
  *  \note The range <tt>[first, last)</tt> is not permitted to overlap with the range <tt>[stencil, stencil + (last - first))</tt>.
  *
- *  \see http://www.sgi.com/tech/stl/remove_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove
  *  \see remove
  *  \see remove_copy
  *  \see remove_copy_if
@@ -692,12 +690,12 @@ template<typename ForwardIterator,
  *  \return An OutputIterator pointing to the end of the resulting range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[stencil, stencil + (last - first))</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
  *
@@ -718,7 +716,7 @@ template<typename ForwardIterator,
  *  // result is now {-1, 1}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/remove_copy_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove_copy
  *  \see remove
  *  \see remove_copy
  *  \see remove_if
@@ -755,12 +753,12 @@ __host__ __device__
  *              to the resulting sequence.
  *  \return An OutputIterator pointing to the end of the resulting range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[stencil, stencil + (last - first))</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
  *
@@ -779,7 +777,7 @@ __host__ __device__
  *  // result is now {-1, 1}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/remove_copy_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove_copy
  *  \see remove
  *  \see remove_copy
  *  \see remove_if
@@ -799,8 +797,6 @@ template<typename InputIterator1,
 /*! \} // end stream_compaction
  */
 
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/remove.inl>
-
diff --git a/thrust/replace.h b/thrust/replace.h
index 225cb060a..a5c0320c4 100644
--- a/thrust/replace.h
+++ b/thrust/replace.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup transformations
  *  \addtogroup replacing
@@ -48,10 +46,10 @@ namespace thrust
  *  \param new_value The new value to replace \p old_value.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html>Assignable.html">Assignable</a>,
- *          \p T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">EqualityComparable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable>Assignable">Assignable</a>,
+ *          \p T is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">EqualityComparable</a>,
  *          objects of \p T may be compared for equality with objects of
  *          \p ForwardIterator's \c value_type,
  *          and \p T is convertible to \p ForwardIterator's \c value_type.
@@ -78,7 +76,7 @@ namespace thrust
  *  // A contains [99, 2, 3, 99]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/replace.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/replace
  *  \see \c replace_if
  *  \see \c replace_copy
  *  \see \c replace_copy_if
@@ -100,10 +98,10 @@ __host__ __device__
  *  \param old_value The value to replace.
  *  \param new_value The new value to replace \p old_value.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html>Assignable.html">Assignable</a>,
- *          \p T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">EqualityComparable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable>Assignable">Assignable</a>,
+ *          \p T is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">EqualityComparable</a>,
  *          objects of \p T may be compared for equality with objects of
  *          \p ForwardIterator's \c value_type,
  *          and \p T is convertible to \p ForwardIterator's \c value_type.
@@ -128,7 +126,7 @@ __host__ __device__
  *  // A contains [99, 2, 3, 99]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/replace.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/replace
  *  \see \c replace_if
  *  \see \c replace_copy
  *  \see \c replace_copy_if
@@ -152,11 +150,11 @@ template<typename ForwardIterator, typename T>
  *         to \c true.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable,
  *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p replace_if to replace
@@ -192,7 +190,7 @@ template<typename ForwardIterator, typename T>
  *  // A contains [1, 0, 2, 0]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/replace_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/replace
  *  \see \c replace
  *  \see \c replace_copy
  *  \see \c replace_copy_if
@@ -215,11 +213,11 @@ __host__ __device__
  *  \param new_value The new value to replace elements which <tt>pred(*i)</tt> evaluates
  *         to \c true.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable,
  *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p replace_if to replace
@@ -253,7 +251,7 @@ __host__ __device__
  *  // A contains [1, 0, 2, 0]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/replace_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/replace
  *  \see \c replace
  *  \see \c replace_copy
  *  \see \c replace_copy_if
@@ -280,12 +278,12 @@ template<typename ForwardIterator, typename Predicate, typename T>
  *         to \c true.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p replace_if to replace
@@ -326,7 +324,7 @@ template<typename ForwardIterator, typename Predicate, typename T>
  *  // A contains [0, 20, 0, 40]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/replace_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/replace
  *  \see \c replace
  *  \see \c replace_copy
  *  \see \c replace_copy_if
@@ -352,12 +350,12 @@ __host__ __device__
  *  \param new_value The new value to replace elements which <tt>pred(*i)</tt> evaluates
  *         to \c true.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p replace_if to replace
@@ -396,7 +394,7 @@ __host__ __device__
  *  // A contains [0, 20, 0, 40]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/replace_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/replace
  *  \see \c replace
  *  \see \c replace_copy
  *  \see \c replace_copy_if
@@ -427,10 +425,10 @@ template<typename ForwardIterator, typename InputIterator, typename Predicate, t
  *  \return <tt>result + (last-first)</tt>
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
- *          \p T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
+ *          \p T is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>,
  *          \p T may be compared for equality with \p InputIterator's \c value_type,
  *          and \p T is convertible to \p OutputIterator's \c value_type.
  *
@@ -454,7 +452,7 @@ template<typename ForwardIterator, typename InputIterator, typename Predicate, t
  *  // B contains [99, 2, 3, 99]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/replace_copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/replace_copy
  *  \see \c copy
  *  \see \c replace
  *  \see \c replace_if
@@ -484,10 +482,10 @@ __host__ __device__
  *  \param new_value The replacement value for which <tt>*i == old_value</tt> evaluates to \c true.
  *  \return <tt>result + (last-first)</tt>
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
- *          \p T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
+ *          \p T is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>,
  *          \p T may be compared for equality with \p InputIterator's \c value_type,
  *          and \p T is convertible to \p OutputIterator's \c value_type.
  *
@@ -510,7 +508,7 @@ __host__ __device__
  *  // B contains [99, 2, 3, 99]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/replace_copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/replace_copy
  *  \see \c copy
  *  \see \c replace
  *  \see \c replace_if
@@ -541,11 +539,11 @@ template<typename InputIterator, typename OutputIterator, typename T>
  *  \return <tt>result + (last-first)</tt>
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p OutputIterator's \c value_type.
  *
  *  \pre \p first may equal \p result, but the ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
@@ -580,7 +578,7 @@ template<typename InputIterator, typename OutputIterator, typename T>
  *  // B contains [1, 0, 2, 0]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/replace_copy_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/replace_copy
  *  \see \c replace
  *  \see \c replace_if
  *  \see \c replace_copy
@@ -609,11 +607,11 @@ __host__ __device__
  *  \param new_value The replacement value to assign <tt>pred(*i)</tt> evaluates to \c true.
  *  \return <tt>result + (last-first)</tt>
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p OutputIterator's \c value_type.
  *
  *  \pre \p first may equal \p result, but the ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
@@ -647,7 +645,7 @@ __host__ __device__
  *  // B contains [1, 0, 2, 0]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/replace_copy_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/replace_copy
  *  \see \c replace
  *  \see \c replace_if
  *  \see \c replace_copy
@@ -679,12 +677,12 @@ template<typename InputIterator, typename OutputIterator, typename Predicate, ty
  *  \return <tt>result + (last-first)</tt>
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p OutputIterator's \c value_type.
  *
  *  \pre \p first may equal \p result, but the ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
@@ -755,12 +753,12 @@ __host__ __device__
  *  \param new_value The replacement value to assign when <tt>pred(*s)</tt> evaluates to \c true. 
  *  \return <tt>result + (last-first)</tt>
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p OutputIterator's \c value_type.
  *
  *  \pre \p first may equal \p result, but the ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
@@ -816,8 +814,6 @@ template<typename InputIterator1, typename InputIterator2, typename OutputIterat
  *  \} // transformations
  */
 
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/replace.inl>
-
diff --git a/thrust/reverse.h b/thrust/reverse.h
index 73bd9579f..056be200a 100644
--- a/thrust/reverse.h
+++ b/thrust/reverse.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup reordering
  *  \ingroup algorithms
@@ -44,7 +42,7 @@ namespace thrust
  *  \param last The end of the range to reverse.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam BidirectionalIterator is a model of <a href="http://www.sgi.com/tech/stl/BidirectionalIterator.html">Bidirectional Iterator</a> and
+ *  \tparam BidirectionalIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/bidirectional_iterator">Bidirectional Iterator</a> and
  *          \p BidirectionalIterator is mutable.
  *
  *  The following code snippet demonstrates how to use \p reverse to reverse a
@@ -62,7 +60,7 @@ namespace thrust
  *  // v is now {5, 4, 3, 2, 1, 0}
  *  \endcode
  *  
- *  \see http://www.sgi.com/tech/stl/reverse.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/reverse
  *  \see \p reverse_copy
  *  \see \p reverse_iterator
  */
@@ -80,7 +78,7 @@ __host__ __device__
  *  \param first The beginning of the range to reverse.
  *  \param last The end of the range to reverse.
  *
- *  \tparam BidirectionalIterator is a model of <a href="http://www.sgi.com/tech/stl/BidirectionalIterator.html">Bidirectional Iterator</a> and
+ *  \tparam BidirectionalIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/bidirectional_iterator">Bidirectional Iterator</a> and
  *          \p BidirectionalIterator is mutable.
  *
  *  The following code snippet demonstrates how to use \p reverse to reverse a
@@ -96,7 +94,7 @@ __host__ __device__
  *  // v is now {5, 4, 3, 2, 1, 0}
  *  \endcode
  *  
- *  \see http://www.sgi.com/tech/stl/reverse.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/reverse
  *  \see \p reverse_copy
  *  \see \p reverse_iterator
  */
@@ -124,9 +122,9 @@ template<typename BidirectionalIterator>
  *  \param result The beginning of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam BidirectionalIterator is a model of <a href="http://www.sgi.com/tech/stl/BidirectionalIterator.html">Bidirectional Iterator</a>,
+ *  \tparam BidirectionalIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/bidirectional_iterator">Bidirectional Iterator</a>,
  *          and \p BidirectionalIterator's \p value_type is convertible to \p OutputIterator's \p value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -147,7 +145,7 @@ template<typename BidirectionalIterator>
  *  // output is now  {5, 4, 3, 2, 1, 0}
  *  \endcode
  *  
- *  \see http://www.sgi.com/tech/stl/reverse_copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/reverse_copy
  *  \see \p reverse
  *  \see \p reverse_iterator
  */
@@ -174,9 +172,9 @@ __host__ __device__
  *  \param last The end of the range to reverse.
  *  \param result The beginning of the output range.
  *
- *  \tparam BidirectionalIterator is a model of <a href="http://www.sgi.com/tech/stl/BidirectionalIterator.html">Bidirectional Iterator</a>,
+ *  \tparam BidirectionalIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/bidirectional_iterator">Bidirectional Iterator</a>,
  *          and \p BidirectionalIterator's \p value_type is convertible to \p OutputIterator's \p value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -195,7 +193,7 @@ __host__ __device__
  *  // output is now  {5, 4, 3, 2, 1, 0}
  *  \endcode
  *  
- *  \see http://www.sgi.com/tech/stl/reverse_copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/reverse_copy
  *  \see \p reverse
  *  \see \p reverse_iterator
  */
@@ -208,8 +206,6 @@ template<typename BidirectionalIterator, typename OutputIterator>
 /*! \} // end reordering
  */
 
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/reverse.inl>
-
diff --git a/thrust/scan.h b/thrust/scan.h
index 5b79af048..9b3814223 100644
--- a/thrust/scan.h
+++ b/thrust/scan.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup algorithms
  */
@@ -46,12 +44,16 @@ namespace thrust
  *  This version of \p inclusive_scan assumes plus as the associative operator.  
  *  When the input and output sequences are the same, the scan is performed 
  *  in-place.
- 
+ *
  *  \p inclusive_scan is similar to \c std::partial_sum in the STL.  The primary
  *  difference between the two functions is that \c std::partial_sum guarantees
  *  a serial summation order, while \p inclusive_scan requires associativity of 
  *  the binary operation to parallelize the prefix sum.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *    
  *  \param exec The execution policy to use for parallelization.
@@ -61,10 +63,10 @@ namespace thrust
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to
  *                        \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's
  *                         \c value_type, then <tt>x + y</tt> is defined. If \c T is
  *                         \c OutputIterator's \c value_type, then <tt>T(0)</tt> is
@@ -87,7 +89,7 @@ namespace thrust
  *  // data is now {1, 1, 3, 5, 6, 9}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partial_sum
  *
  */
 template<typename DerivedPolicy,
@@ -108,21 +110,25 @@ __host__ __device__
  *  This version of \p inclusive_scan assumes plus as the associative operator.  
  *  When the input and output sequences are the same, the scan is performed 
  *  in-place.
- 
+ *
  *  \p inclusive_scan is similar to \c std::partial_sum in the STL.  The primary
  *  difference between the two functions is that \c std::partial_sum guarantees
  *  a serial summation order, while \p inclusive_scan requires associativity of 
  *  the binary operation to parallelize the prefix sum.
- *    
+ *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  \param first The beginning of the input sequence.
  *  \param last The end of the input sequence.
  *  \param result The beginning of the output sequence.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to
  *                        \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's
  *                         \c value_type, then <tt>x + y</tt> is defined. If \c T is
  *                         \c OutputIterator's \c value_type, then <tt>T(0)</tt> is
@@ -142,7 +148,7 @@ __host__ __device__
  *  // data is now {1, 1, 3, 5, 6, 9}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partial_sum
  *
  */
 template<typename InputIterator,
@@ -156,12 +162,16 @@ template<typename InputIterator,
  *  term 'inclusive' means that each result includes the corresponding
  *  input operand in the partial sum.  When the input and output sequences 
  *  are the same, the scan is performed in-place.
- *    
+ *
  *  \p inclusive_scan is similar to \c std::partial_sum in the STL.  The primary
  *  difference between the two functions is that \c std::partial_sum guarantees
  *  a serial summation order, while \p inclusive_scan requires associativity of 
  *  the binary operation to parallelize the prefix sum.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *
  *  \param exec The execution policy to use for parallelization.
@@ -172,14 +182,14 @@ template<typename InputIterator,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to
  *                        \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>
  *                         and \c OutputIterator's \c value_type is convertible to
  *                         both \c AssociativeOperator's \c first_argument_type and
  *                         \c second_argument_type.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c AssociativeOperator's \c result_type is
  *                              convertible to \c OutputIterator's \c value_type.
  *
@@ -198,7 +208,7 @@ template<typename InputIterator,
  *  // data is now {-5, 0, 2, 2, 2, 4, 4, 4, 4, 8}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partial_sum
  */
 template<typename DerivedPolicy,
          typename InputIterator,
@@ -222,20 +232,24 @@ __host__ __device__
  *  a serial summation order, while \p inclusive_scan requires associativity of 
  *  the binary operation to parallelize the prefix sum.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  \param first The beginning of the input sequence.
  *  \param last The end of the input sequence.
  *  \param result The beginning of the output sequence.
  *  \param binary_op The associatve operator used to 'sum' values.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to
  *                        \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>
  *                         and \c OutputIterator's \c value_type is convertible to
  *                         both \c AssociativeOperator's \c first_argument_type and
  *                         \c second_argument_type.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c AssociativeOperator's \c result_type is
  *                              convertible to \c OutputIterator's \c value_type.
  *
@@ -253,7 +267,7 @@ __host__ __device__
  *  // data is now {-5, 0, 2, 2, 2, 4, 4, 4, 4, 8}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partial_sum
  */
 template<typename InputIterator,
          typename OutputIterator,
@@ -273,6 +287,10 @@ template<typename InputIterator,
  *  associative operator and \c 0 as the initial value.  When the input and 
  *  output sequences are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *    
  *  \param exec The execution policy to use for parallelization.
@@ -282,10 +300,10 @@ template<typename InputIterator,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to
  *                        \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's
  *                         \c value_type, then <tt>x + y</tt> is defined. If \c T is
  *                         \c OutputIterator's \c value_type, then <tt>T(0)</tt> is
@@ -308,7 +326,7 @@ template<typename InputIterator,
  *  // data is now {0, 1, 1, 3, 5, 6}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partial_sum
  */
 template<typename DerivedPolicy,
          typename InputIterator,
@@ -328,16 +346,20 @@ __host__ __device__
  *  and so on. This version of \p exclusive_scan assumes plus as the 
  *  associative operator and \c 0 as the initial value.  When the input and 
  *  output sequences are the same, the scan is performed in-place.
- *    
+ *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  \param first The beginning of the input sequence.
  *  \param last The end of the input sequence.
  *  \param result The beginning of the output sequence.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to
  *                        \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's
  *                         \c value_type, then <tt>x + y</tt> is defined. If \c T is
  *                         \c OutputIterator's \c value_type, then <tt>T(0)</tt> is
@@ -357,7 +379,7 @@ __host__ __device__
  *  // data is now {0, 1, 1, 3, 5, 6}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partial_sum
  */
 template<typename InputIterator,
          typename OutputIterator>
@@ -375,6 +397,10 @@ template<typename InputIterator,
  *  operator but requires an initial value \p init.  When the input and 
  *  output sequences are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *
  *  \param exec The execution policy to use for parallelization.
@@ -385,10 +411,10 @@ template<typename InputIterator,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to
  *                        \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's
  *                         \c value_type, then <tt>x + y</tt> is defined.
  *  \tparam T is convertible to \c OutputIterator's \c value_type.
@@ -409,7 +435,7 @@ template<typename InputIterator,
  *  // data is now {4, 5, 5, 7, 9, 10}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partial_sum
  */
 template<typename DerivedPolicy,
          typename InputIterator,
@@ -432,16 +458,20 @@ __host__ __device__
  *  operator but requires an initial value \p init.  When the input and 
  *  output sequences are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  \param first The beginning of the input sequence.
  *  \param last The end of the input sequence.
  *  \param result The beginning of the output sequence.
  *  \param init The initial value.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to
  *                        \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's
  *                         \c value_type, then <tt>x + y</tt> is defined.
  *  \tparam T is convertible to \c OutputIterator's \c value_type.
@@ -460,7 +490,7 @@ __host__ __device__
  *  // data is now {4, 5, 5, 7, 9, 10}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partial_sum
  */
 template<typename InputIterator,
          typename OutputIterator,
@@ -480,6 +510,10 @@ template<typename InputIterator,
  *  operator and an initial value \p init.  When the input and output
  *  sequences are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *    
  *  \param exec The execution policy to use for parallelization.
@@ -491,15 +525,15 @@ template<typename InputIterator,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to
  *                        \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>
  *                         and \c OutputIterator's \c value_type is convertible to
  *                         both \c AssociativeOperator's \c first_argument_type and
  *                         \c second_argument_type.
  *  \tparam T is convertible to \c OutputIterator's \c value_type.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c AssociativeOperator's \c result_type is
  *                              convertible to \c OutputIterator's \c value_type.
  *
@@ -523,7 +557,7 @@ template<typename InputIterator,
  *  // data is now {1, 1, 1, 2, 2, 2, 4, 4, 4, 4 }
  *  \endcode
  *  
- *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partial_sum
  */
 template<typename DerivedPolicy,
          typename InputIterator,
@@ -547,7 +581,11 @@ __host__ __device__
  *  and so on. This version of the function requires both an associative 
  *  operator and an initial value \p init.  When the input and output
  *  sequences are the same, the scan is performed in-place.
- *    
+ *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  \param first The beginning of the input sequence.
  *  \param last The end of the input sequence.
  *  \param result The beginning of the output sequence.
@@ -555,15 +593,15 @@ __host__ __device__
  *  \param binary_op The associatve operator used to 'sum' values.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to
  *                        \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>
  *                         and \c OutputIterator's \c value_type is convertible to
  *                         both \c AssociativeOperator's \c first_argument_type and
  *                         \c second_argument_type.
  *  \tparam T is convertible to \c OutputIterator's \c value_type.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c AssociativeOperator's \c result_type is
  *                              convertible to \c OutputIterator's \c value_type.
  *
@@ -584,7 +622,7 @@ __host__ __device__
  *  // data is now {1, 1, 1, 2, 2, 2, 4, 4, 4, 4 }
  *  \endcode
  *  
- *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partial_sum
  */
 template<typename InputIterator,
          typename OutputIterator,
@@ -620,6 +658,10 @@ template<typename InputIterator,
  *  operator used to perform the prefix sum. When the input and output sequences
  *  are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *
  *  \param exec The execution policy to use for parallelization.
@@ -630,10 +672,10 @@ template<typename InputIterator,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
  *                         <tt>binary_op(x,y)</tt> is defined.
  *
@@ -689,16 +731,20 @@ __host__ __device__
  *  operator used to perform the prefix sum. When the input and output sequences
  *  are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  \param first1 The beginning of the key sequence.
  *  \param last1 The end of the key sequence.
  *  \param first2 The beginning of the input value sequence.
  *  \param result The beginning of the output value sequence.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
  *                         <tt>binary_op(x,y)</tt> is defined.
  *
@@ -748,6 +794,10 @@ template<typename InputIterator1,
  *  operator used to perform the prefix sum. When the input and output sequences
  *  are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec. 
  *
  *  \param exec The execution policy to use for parallelization.
@@ -759,13 +809,13 @@ template<typename InputIterator1,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
  *                         <tt>binary_op(x,y)</tt> is defined.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
  *  \pre \p first2 may equal \p result but the range <tt>[first2, first2 + (last1 - first1)</tt> and range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
@@ -824,6 +874,10 @@ __host__ __device__
  *  operator used to perform the prefix sum. When the input and output sequences
  *  are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  \param first1 The beginning of the key sequence.
  *  \param last1 The end of the key sequence.
  *  \param first2 The beginning of the input value sequence.
@@ -831,13 +885,13 @@ __host__ __device__
  *  \param binary_pred  The binary predicate used to determine equality of keys.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
  *                         <tt>binary_op(x,y)</tt> is defined.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
  *  \pre \p first2 may equal \p result but the range <tt>[first2, first2 + (last1 - first1)</tt> and range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
@@ -890,6 +944,10 @@ template<typename InputIterator1,
  *  \c binary_op to perform the prefix sum. When the input and output sequences
  *  are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *
  *  \param exec The execution policy to use for parallelization.
@@ -902,14 +960,14 @@ template<typename InputIterator1,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
  *                         <tt>binary_op(x,y)</tt> is defined.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c AssociativeOperator's \c result_type is
  *                              convertible to \c OutputIterator's \c value_type.
  *
@@ -969,6 +1027,10 @@ __host__ __device__
  *  belong to the same segment if <tt>binary_pred(*i, *(i+1))</tt> is true, and belong to 
  *  different segments otherwise.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  This version of \p inclusive_scan_by_key uses the associative operator 
  *  \c binary_op to perform the prefix sum. When the input and output sequences
  *  are the same, the scan is performed in-place.
@@ -981,14 +1043,14 @@ __host__ __device__
  *  \param binary_op The associatve operator used to 'sum' values.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
  *                         <tt>binary_op(x,y)</tt> is defined.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c AssociativeOperator's \c result_type is
  *                              convertible to \c OutputIterator's \c value_type.
  *
@@ -1044,6 +1106,10 @@ template<typename InputIterator1,
  *  belong to the same segment if <tt>*i == *(i+1)</tt>, and belong to 
  *  different segments otherwise.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  Refer to the most general form of \p exclusive_scan_by_key for additional details.
  *
  *  The algorithm's execution is parallelized as determined by \p exec.
@@ -1103,6 +1169,10 @@ __host__ __device__
  *  belong to the same segment if <tt>*i == *(i+1)</tt>, and belong to 
  *  different segments otherwise.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  Refer to the most general form of \p exclusive_scan_by_key for additional details.
  *
  *  \param first1 The beginning of the key sequence.
@@ -1148,6 +1218,10 @@ template<typename InputIterator1,
  *  This version of \p exclusive_scan_by_key uses the value \c init to
  *  initialize the exclusive scan operation.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *
  *  \param exec The execution policy to use for parallelization.
@@ -1208,6 +1282,10 @@ __host__ __device__
  *  This version of \p exclusive_scan_by_key uses the value \c init to
  *  initialize the exclusive scan operation.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  \param first1 The beginning of the key sequence.
  *  \param last1 The end of the key sequence.
  *  \param first2 The beginning of the input value sequence.
@@ -1264,6 +1342,10 @@ template<typename InputIterator1,
  *  <tt>i+1</tt> in the range <tt>[first1, last1)</tt> belong to the same segment if
  *  <tt>binary_pred(*i, *(i+1))</tt> is true, and belong to different segments otherwise.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *
  *  \param exec The execution policy to use for parallelization.
@@ -1334,6 +1416,10 @@ __host__ __device__
  *  <tt>i+1</tt> in the range <tt>[first1, last1)</tt> belong to the same segment if
  *  <tt>binary_pred(*i, *(i+1))</tt> is true, and belong to different segments otherwise.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  \param first1 The beginning of the key sequence.
  *  \param last1 The end of the key sequence.
  *  \param first2 The beginning of the input value sequence.
@@ -1399,6 +1485,10 @@ template<typename InputIterator1,
  *  \c binary_op to perform the prefix sum. When the input and output sequences
  *  are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *
  *  \param exec The execution policy to use for parallelization.
@@ -1412,15 +1502,15 @@ template<typename InputIterator1,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
  *                         <tt>binary_op(x,y)</tt> is defined.
  *  \tparam T is convertible to \c OutputIterator's \c value_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                         and \c AssociativeOperator's \c result_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
@@ -1489,6 +1579,10 @@ __host__ __device__
  *  \c binary_op to perform the prefix sum. When the input and output sequences
  *  are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  \param first1 The beginning of the key sequence.
  *  \param last1 The end of the key sequence.
  *  \param first2 The beginning of the input value sequence.
@@ -1498,15 +1592,15 @@ __host__ __device__
  *  \param binary_op The associatve operator used to 'sum' values.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
  *                         <tt>binary_op(x,y)</tt> is defined.
  *  \tparam T is convertible to \c OutputIterator's \c value_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                         and \c AssociativeOperator's \c result_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
@@ -1557,8 +1651,6 @@ template<typename InputIterator1,
 /*! \} // end prefix sums
  */
 
-	
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/scan.inl>
-
diff --git a/thrust/scatter.h b/thrust/scatter.h
index baaf1e63b..b8b0bd84f 100644
--- a/thrust/scatter.h
+++ b/thrust/scatter.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup scattering
  *  \ingroup copying
@@ -50,9 +48,9 @@ namespace thrust
  *  \param result Destination of the source elements.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
- *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a>.
+ *  \tparam InputIterator1 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
+ *  \tparam InputIterator2 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access iterator</a>.
  *
  *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`.
  *
@@ -110,9 +108,9 @@ __host__ __device__
  *  \param map  Beginning of the sequence of output indices.
  *  \param result Destination of the source elements.
  *
- *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
- *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a>.
+ *  \tparam InputIterator1 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
+ *  \tparam InputIterator2 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access iterator</a>.
  *
  *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`.
  *
@@ -171,10 +169,10 @@ template<typename InputIterator1,
  *  \param output Beginning of the destination range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
- *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam InputIterator3 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator3's \c value_type must be convertible to \c bool.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a>.
+ *  \tparam InputIterator1 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
+ *  \tparam InputIterator2 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator3 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator3's \c value_type must be convertible to \c bool.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access iterator</a>.
  *
  *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`.
  *
@@ -228,10 +226,10 @@ __host__ __device__
  *  \param stencil Beginning of the sequence of predicate values.
  *  \param output Beginning of the destination range.
  *
- *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
- *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam InputIterator3 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator3's \c value_type must be convertible to \c bool.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a>.
+ *  \tparam InputIterator1 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
+ *  \tparam InputIterator2 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator3 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator3's \c value_type must be convertible to \c bool.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access iterator</a>.
  *
  *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`.
  *
@@ -286,11 +284,11 @@ template<typename InputIterator1,
  *  \param pred Predicate to apply to the stencil values.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
- *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam InputIterator3 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator3's \c value_type must be convertible to \c Predicate's \c argument_type.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a>.
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator1 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
+ *  \tparam InputIterator2 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator3 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator3's \c value_type must be convertible to \c Predicate's \c argument_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access iterator</a>.
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`.
  *
@@ -359,11 +357,11 @@ __host__ __device__
  *  \param output Beginning of the destination range.
  *  \param pred Predicate to apply to the stencil values.
  *
- *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
- *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam InputIterator3 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator3's \c value_type must be convertible to \c Predicate's \c argument_type.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a>.
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator1 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
+ *  \tparam InputIterator2 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator3 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator3's \c value_type must be convertible to \c Predicate's \c argument_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access iterator</a>.
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`.
  *
@@ -416,8 +414,6 @@ template<typename InputIterator1,
 /*! \} // end scattering
  */
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/scatter.inl>
-
diff --git a/thrust/sequence.h b/thrust/sequence.h
index e92391f64..fb3959e3c 100644
--- a/thrust/sequence.h
+++ b/thrust/sequence.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup transformations
  *  \{
@@ -45,7 +43,7 @@ namespace thrust
  *  \param last The end of the sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
  *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
  *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
@@ -66,7 +64,7 @@ namespace thrust
  *  \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no
  *        guarantee on order of execution.
  *
- *  \see http://www.sgi.com/tech/stl/iota.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/iota
  */
 template<typename DerivedPolicy, typename ForwardIterator>
 __host__ __device__
@@ -83,7 +81,7 @@ __host__ __device__
  *  \param first The beginning of the sequence.
  *  \param last The end of the sequence.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
  *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
  *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
@@ -103,7 +101,7 @@ __host__ __device__
  *  \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no
  *        guarantee on order of execution.
  *
- *  \see http://www.sgi.com/tech/stl/iota.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/iota
  */
 template<typename ForwardIterator>
   void sequence(ForwardIterator first,
@@ -123,11 +121,11 @@ template<typename ForwardIterator>
  *  \param init The first value of the sequence of numbers.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
  *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
  *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p sequence to fill a range
@@ -147,7 +145,7 @@ template<typename ForwardIterator>
  *  \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no
  *        guarantee on order of execution.
  *
- *  \see http://www.sgi.com/tech/stl/iota.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/iota
  */
 template<typename DerivedPolicy, typename ForwardIterator, typename T>
 __host__ __device__
@@ -166,11 +164,11 @@ __host__ __device__
  *  \param last The end of the sequence.
  *  \param init The first value of the sequence of numbers.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
  *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
  *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p sequence to fill a range
@@ -188,7 +186,7 @@ __host__ __device__
  *  \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no
  *        guarantee on order of execution.
  *
- *  \see http://www.sgi.com/tech/stl/iota.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/iota
  */
 template<typename ForwardIterator, typename T>
   void sequence(ForwardIterator first,
@@ -210,11 +208,11 @@ template<typename ForwardIterator, typename T>
  *  \param step The difference between consecutive elements.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
  *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
  *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p sequence to fill a range
@@ -234,7 +232,7 @@ template<typename ForwardIterator, typename T>
  *  \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no
  *        guarantee on order of execution.
  *
- *  \see http://www.sgi.com/tech/stl/iota.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/iota
  */
 template<typename DerivedPolicy, typename ForwardIterator, typename T>
 __host__ __device__
@@ -255,11 +253,11 @@ __host__ __device__
  *  \param init The first value of the sequence of numbers
  *  \param step The difference between consecutive elements.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
  *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
  *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p sequence to fill a range
@@ -277,7 +275,7 @@ __host__ __device__
  *  \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no
  *        guarantee on order of execution.
  *
- *  \see http://www.sgi.com/tech/stl/iota.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/iota
  */
 template<typename ForwardIterator, typename T>
   void sequence(ForwardIterator first,
@@ -289,8 +287,7 @@ template<typename ForwardIterator, typename T>
 /*! \} // end transformations
  */
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/sequence.inl>
 
diff --git a/thrust/set_operations.h b/thrust/set_operations.h
index a51eaed43..65a48d1b6 100644
--- a/thrust/set_operations.h
+++ b/thrust/set_operations.h
@@ -25,9 +25,7 @@
 #include <thrust/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup set_operations Set Operations
  *  \ingroup algorithms
@@ -61,17 +59,17 @@ namespace thrust
  *  \return The end of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting range shall not overlap with either input range.
@@ -84,16 +82,16 @@ namespace thrust
  *  #include <thrust/set_operations.h>
  *  #include <thrust/execution_policy.h>
  *  ...
- *  int A1[6] = {0, 1, 3, 4, 5, 6, 9};
+ *  int A1[7] = {0, 1, 3, 4, 5, 6, 9};
  *  int A2[5] = {1, 3, 5, 7, 9};
  *
  *  int result[3];
  *
- *  int *result_end = thrust::set_difference(thrust::host, A1, A1 + 6, A2, A2 + 5, result);
+ *  int *result_end = thrust::set_difference(thrust::host, A1, A1 + 7, A2, A2 + 5, result);
  *  // result is now {0, 4, 6}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_difference
  *  \see \p includes
  *  \see \p set_union
  *  \see \p set_intersection
@@ -136,17 +134,17 @@ __host__ __device__
  *  \param result The beginning of the output range.
  *  \return The end of the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting range shall not overlap with either input range.
@@ -157,16 +155,16 @@ __host__ __device__
  *  \code
  *  #include <thrust/set_operations.h>
  *  ...
- *  int A1[6] = {0, 1, 3, 4, 5, 6, 9};
+ *  int A1[7] = {0, 1, 3, 4, 5, 6, 9};
  *  int A2[5] = {1, 3, 5, 7, 9};
  *
  *  int result[3];
  *
- *  int *result_end = thrust::set_difference(A1, A1 + 6, A2, A2 + 5, result);
+ *  int *result_end = thrust::set_difference(A1, A1 + 7, A2, A2 + 5, result);
  *  // result is now {0, 4, 6}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_difference
  *  \see \p includes
  *  \see \p set_union
  *  \see \p set_intersection
@@ -211,14 +209,14 @@ template<typename InputIterator1,
  *  \return The end of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting range shall not overlap with either input range.
@@ -232,16 +230,16 @@ template<typename InputIterator1,
  *  #include <thrust/functional.h>
  *  #include <thrust/execution_policy.h>
  *  ...
- *  int A1[6] = {9, 6, 5, 4, 3, 1, 0};
+ *  int A1[7] = {9, 6, 5, 4, 3, 1, 0};
  *  int A2[5] = {9, 7, 5, 3, 1};
  *
  *  int result[3];
  *
- *  int *result_end = thrust::set_difference(thrust::host, A1, A1 + 6, A2, A2 + 5, result, thrust::greater<int>());
+ *  int *result_end = thrust::set_difference(thrust::host, A1, A1 + 7, A2, A2 + 5, result, thrust::greater<int>());
  *  // result is now {6, 4, 0}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_difference
  *  \see \p includes
  *  \see \p set_union
  *  \see \p set_intersection
@@ -287,14 +285,14 @@ __host__ __device__
  *  \param comp Comparison operator.
  *  \return The end of the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting range shall not overlap with either input range.
@@ -306,16 +304,16 @@ __host__ __device__
  *  #include <thrust/set_operations.h>
  *  #include <thrust/functional.h>
  *  ...
- *  int A1[6] = {9, 6, 5, 4, 3, 1, 0};
+ *  int A1[7] = {9, 6, 5, 4, 3, 1, 0};
  *  int A2[5] = {9, 7, 5, 3, 1};
  *
  *  int result[3];
  *
- *  int *result_end = thrust::set_difference(A1, A1 + 6, A2, A2 + 5, result, thrust::greater<int>());
+ *  int *result_end = thrust::set_difference(A1, A1 + 7, A2, A2 + 5, result, thrust::greater<int>());
  *  // result is now {6, 4, 0}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_difference
  *  \see \p includes
  *  \see \p set_union
  *  \see \p set_intersection
@@ -368,17 +366,17 @@ template<typename InputIterator1,
  *  \return The end of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting range shall not overlap with either input range.
@@ -400,7 +398,7 @@ template<typename InputIterator1,
  *  // result is now {1, 3, 5}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_intersection.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_intersection
  *  \see \p includes
  *  \see \p set_union
  *  \see \p set_intersection
@@ -450,17 +448,17 @@ __host__ __device__
  *  \param result The beginning of the output range.
  *  \return The end of the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting range shall not overlap with either input range.
@@ -480,7 +478,7 @@ __host__ __device__
  *  // result is now {1, 3, 5}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_intersection.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_intersection
  *  \see \p includes
  *  \see \p set_union
  *  \see \p set_intersection
@@ -534,17 +532,17 @@ template<typename InputIterator1,
  *  \pre The resulting range shall not overlap with either input range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  The following code snippet demonstrates how to use \p set_intersection to compute
  *  the set intersection of sets of integers sorted in descending order using the \p thrust::host execution
@@ -563,7 +561,7 @@ template<typename InputIterator1,
  *  // result is now {5, 3, 1}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_intersection.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_intersection
  *  \see \p includes
  *  \see \p set_union
  *  \see \p set_intersection
@@ -618,17 +616,17 @@ __host__ __device__
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting range shall not overlap with either input range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  The following code snippet demonstrates how to use \p set_intersection to compute
  *  the set intersection of sets of integers sorted in descending order.
@@ -645,7 +643,7 @@ __host__ __device__
  *  // result is now {5, 3, 1}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_intersection.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_intersection
  *  \see \p includes
  *  \see \p set_union
  *  \see \p set_intersection
@@ -694,17 +692,17 @@ template<typename InputIterator1,
  *  \return The end of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting range shall not overlap with either input range.
@@ -717,16 +715,16 @@ template<typename InputIterator1,
  *  #include <thrust/set_operations.h>
  *  #include <thrust/execution_policy.h>
  *  ...
- *  int A1[6] = {0, 1, 2, 2, 4, 6, 7};
+ *  int A1[7] = {0, 1, 2, 2, 4, 6, 7};
  *  int A2[5] = {1, 1, 2, 5, 8};
  *
  *  int result[6];
  *
- *  int *result_end = thrust::set_symmetric_difference(thrust::host, A1, A1 + 6, A2, A2 + 5, result);
+ *  int *result_end = thrust::set_symmetric_difference(thrust::host, A1, A1 + 7, A2, A2 + 5, result);
  *  // result = {0, 4, 5, 6, 7, 8}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_symmetric_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_symmetric_difference
  *  \see \p merge
  *  \see \p includes
  *  \see \p set_difference
@@ -773,17 +771,17 @@ __host__ __device__
  *  \param result The beginning of the output range.
  *  \return The end of the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting range shall not overlap with either input range.
@@ -794,16 +792,16 @@ __host__ __device__
  *  \code
  *  #include <thrust/set_operations.h>
  *  ...
- *  int A1[6] = {0, 1, 2, 2, 4, 6, 7};
+ *  int A1[7] = {0, 1, 2, 2, 4, 6, 7};
  *  int A2[5] = {1, 1, 2, 5, 8};
  *
  *  int result[6];
  *
- *  int *result_end = thrust::set_symmetric_difference(A1, A1 + 6, A2, A2 + 5, result);
+ *  int *result_end = thrust::set_symmetric_difference(A1, A1 + 7, A2, A2 + 5, result);
  *  // result = {0, 4, 5, 6, 7, 8}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_symmetric_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_symmetric_difference
  *  \see \p merge
  *  \see \p includes
  *  \see \p set_difference
@@ -852,17 +850,17 @@ template<typename InputIterator1,
  *  \return The end of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting range shall not overlap with either input range.
@@ -875,16 +873,16 @@ template<typename InputIterator1,
  *  #include <thrust/set_operations.h>
  *  #include <thrust/execution_policy.h>
  *  ...
- *  int A1[6] = {7, 6, 4, 2, 2, 1, 0};
+ *  int A1[7] = {7, 6, 4, 2, 2, 1, 0};
  *  int A2[5] = {8, 5, 2, 1, 1};
  *
  *  int result[6];
  *
- *  int *result_end = thrust::set_symmetric_difference(thrust::host, A1, A1 + 6, A2, A2 + 5, result);
+ *  int *result_end = thrust::set_symmetric_difference(thrust::host, A1, A1 + 7, A2, A2 + 5, result);
  *  // result = {8, 7, 6, 5, 4, 0}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_symmetric_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_symmetric_difference
  *  \see \p merge
  *  \see \p includes
  *  \see \p set_difference
@@ -934,17 +932,17 @@ __host__ __device__
  *  \param comp Comparison operator.
  *  \return The end of the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting range shall not overlap with either input range.
@@ -955,16 +953,16 @@ __host__ __device__
  *  \code
  *  #include <thrust/set_operations.h>
  *  ...
- *  int A1[6] = {7, 6, 4, 2, 2, 1, 0};
+ *  int A1[7] = {7, 6, 4, 2, 2, 1, 0};
  *  int A2[5] = {8, 5, 2, 1, 1};
  *
  *  int result[6];
  *
- *  int *result_end = thrust::set_symmetric_difference(A1, A1 + 6, A2, A2 + 5, result);
+ *  int *result_end = thrust::set_symmetric_difference(A1, A1 + 7, A2, A2 + 5, result);
  *  // result = {8, 7, 6, 5, 4, 0}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_symmetric_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_symmetric_difference
  *  \see \p merge
  *  \see \p includes
  *  \see \p set_difference
@@ -1012,17 +1010,17 @@ template<typename InputIterator1,
  *  \return The end of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting range shall not overlap with either input range.
@@ -1044,7 +1042,7 @@ template<typename InputIterator1,
  *  // result = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_union.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_union
  *  \see \p merge
  *  \see \p includes
  *  \see \p set_union
@@ -1089,17 +1087,17 @@ __host__ __device__
  *  \param result The beginning of the output range.
  *  \return The end of the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting range shall not overlap with either input range.
@@ -1119,7 +1117,7 @@ __host__ __device__
  *  // result = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_union.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_union
  *  \see \p merge
  *  \see \p includes
  *  \see \p set_union
@@ -1166,14 +1164,14 @@ template<typename InputIterator1,
  *  \return The end of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting range shall not overlap with either input range.
@@ -1196,7 +1194,7 @@ template<typename InputIterator1,
  *  // result = {12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_union.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_union
  *  \see \p merge
  *  \see \p includes
  *  \see \p set_union
@@ -1244,14 +1242,14 @@ __host__ __device__
  *  \param comp Comparison operator.
  *  \return The end of the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting range shall not overlap with either input range.
@@ -1272,7 +1270,7 @@ __host__ __device__
  *  // result = {12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_union.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_union
  *  \see \p merge
  *  \see \p includes
  *  \see \p set_union
@@ -1330,22 +1328,22 @@ template<typename InputIterator1,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -1431,22 +1429,22 @@ __host__ __device__
  *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -1532,23 +1530,23 @@ template<typename InputIterator1,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -1638,23 +1636,23 @@ __host__ __device__
  *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -1745,20 +1743,20 @@ template<typename InputIterator1,
  *        \c values_first2 parameter because elements from the second input range are never copied to the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -1845,20 +1843,20 @@ __host__ __device__
  *  \note Unlike the other key-value set operations, \p set_intersection_by_key is unique in that it has no
  *        \c values_first2 parameter because elements from the second input range are never copied to the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -1945,21 +1943,21 @@ template<typename InputIterator1,
  *        \c values_first2 parameter because elements from the second input range are never copied to the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -2050,21 +2048,21 @@ __host__ __device__
  *  \note Unlike the other key-value set operations, \p set_intersection_by_key is unique in that it has no
  *        \c values_first2 parameter because elements from the second input range are never copied to the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -2153,22 +2151,22 @@ template<typename InputIterator1,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -2257,22 +2255,22 @@ __host__ __device__
  *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -2361,23 +2359,23 @@ template<typename InputIterator1,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -2470,23 +2468,23 @@ __host__ __device__
  *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -2575,22 +2573,22 @@ template<typename InputIterator1,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -2677,22 +2675,22 @@ __host__ __device__
  *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -2779,23 +2777,23 @@ template<typename InputIterator1,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -2886,23 +2884,23 @@ __host__ __device__
  *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -2956,8 +2954,6 @@ template<typename InputIterator1,
 /*! \} // end set_operations
  */
 
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/set_operations.inl>
-
diff --git a/thrust/shuffle.h b/thrust/shuffle.h
new file mode 100644
index 000000000..d95327e29
--- /dev/null
+++ b/thrust/shuffle.h
@@ -0,0 +1,179 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file shuffle.h
+ *  \brief Reorders range by a uniform random permutation
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup reordering
+*  \ingroup algorithms
+*
+*  \addtogroup shuffling
+*  \ingroup reordering
+*  \{
+*/
+
+
+/*! \p shuffle reorders the elements <tt>[first, last)</tt> by a uniform pseudorandom permutation, defined by
+ *  random engine \p g.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence to shuffle.
+ *  \param last The end of the sequence to shuffle.
+ *  \param g A UniformRandomBitGenerator
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam RandomIterator is a random access iterator
+ *  \tparam URBG is a uniform random bit generator
+ *
+ *  The following code snippet demonstrates how to use \p shuffle to create a random permutation
+ *  using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/shuffle.h>
+ *  #include <thrust/random.h>
+ *  #include <thrust/execution_policy.h>
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  const int N = sizeof(A)/sizeof(int);
+ *  thrust::default_random_engine g;
+ *  thrust::shuffle(thrust::host, A, A + N, g);
+ *  // A is now {6, 5, 8, 7, 2, 1, 4, 3, 10, 9}
+ *  \endcode
+ *
+ *  \see \p shuffle_copy
+ */
+template <typename DerivedPolicy, typename RandomIterator, typename URBG>
+__host__ __device__ void shuffle(
+    const thrust::detail::execution_policy_base<DerivedPolicy>& exec,
+    RandomIterator first, RandomIterator last, URBG&& g);
+
+/*! \p shuffle reorders the elements <tt>[first, last)</tt> by a uniform pseudorandom permutation, defined by
+ *  random engine \p g.
+ *
+ *  \param first The beginning of the sequence to shuffle.
+ *  \param last The end of the sequence to shuffle.
+ *  \param g A UniformRandomBitGenerator
+ *
+ *  \tparam RandomIterator is a random access iterator
+ *  \tparam URBG is a uniform random bit generator
+ *
+ *  The following code snippet demonstrates how to use \p shuffle to create a random permutation.
+ *
+ *  \code
+ *  #include <thrust/shuffle.h>
+ *  #include <thrust/random.h>
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  const int N = sizeof(A)/sizeof(int);
+ *  thrust::default_random_engine g;
+ *  thrust::shuffle(A, A + N, g);
+ *  // A is now {6, 5, 8, 7, 2, 1, 4, 3, 10, 9}
+ *  \endcode
+ *
+ *  \see \p shuffle_copy
+ */
+template <typename RandomIterator, typename URBG>
+__host__ __device__ void shuffle(RandomIterator first, RandomIterator last,
+                                 URBG&& g);
+
+/*! shuffle_copy differs from shuffle only in that the reordered sequence is written to different output sequences, rather than in place.
+ *  \p shuffle_copy reorders the elements <tt>[first, last)</tt> by a uniform pseudorandom permutation, defined by
+ *  random engine \p g.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence to shuffle.
+ *  \param last The end of the sequence to shuffle.
+ *  \param result Destination of shuffled sequence
+ *  \param g A UniformRandomBitGenerator
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam RandomIterator is a random access iterator
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam URBG is a uniform random bit generator
+ *
+ *  The following code snippet demonstrates how to use \p shuffle_copy to create a random permutation.
+ *
+ *  \code
+ *  #include <thrust/shuffle.h>
+ *  #include <thrust/random.h>
+ *  #include <thrust/execution_policy.h>
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  int result[10];
+ *  const int N = sizeof(A)/sizeof(int);
+ *  thrust::default_random_engine g;
+ *  thrust::shuffle_copy(thrust::host, A, A + N, result, g);
+ *  // result is now {6, 5, 8, 7, 2, 1, 4, 3, 10, 9}
+ *  \endcode
+ *
+ *  \see \p shuffle
+ */
+template <typename DerivedPolicy, typename RandomIterator,
+          typename OutputIterator, typename URBG>
+__host__ __device__ void shuffle_copy(
+    const thrust::detail::execution_policy_base<DerivedPolicy>& exec,
+    RandomIterator first, RandomIterator last, OutputIterator result, URBG&& g);
+
+/*! shuffle_copy differs from shuffle only in that the reordered sequence is written to different output sequences, rather than in place.
+ *\p shuffle_copy reorders the elements <tt>[first, last)</tt> by a uniform pseudorandom permutation, defined by
+ *  random engine \p g.
+ *
+ *  \param first The beginning of the sequence to shuffle.
+ *  \param last The end of the sequence to shuffle.
+ *  \param result Destination of shuffled sequence
+ *  \param g A UniformRandomBitGenerator
+ *
+ *  \tparam RandomIterator is a random access iterator
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam URBG is a uniform random bit generator
+ *
+ *  The following code snippet demonstrates how to use \p shuffle_copy to create a random permutation.
+ *
+ *  \code
+ *  #include <thrust/shuffle.h>
+ *  #include <thrust/random.h>
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  int result[10];
+ *  const int N = sizeof(A)/sizeof(int);
+ *  thrust::default_random_engine g;
+ *  thrust::shuffle_copy(A, A + N, result, g);
+ *  // result is now {6, 5, 8, 7, 2, 1, 4, 3, 10, 9}
+ *  \endcode
+ *
+ *  \see \p shuffle
+ */
+template <typename RandomIterator, typename OutputIterator, typename URBG>
+__host__ __device__ void shuffle_copy(RandomIterator first, RandomIterator last,
+                                      OutputIterator result, URBG&& g);
+
+THRUST_NAMESPACE_END
+
+#include <thrust/detail/shuffle.inl>
+#endif
diff --git a/thrust/sort.h b/thrust/sort.h
index a100f9602..5cf9d6217 100644
--- a/thrust/sort.h
+++ b/thrust/sort.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup sorting
  *  \ingroup algorithms
@@ -51,11 +49,11 @@ namespace thrust
  *  \param last The end of the sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator is mutable,
- *          and \p RandomAccessIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and \p RandomAccessIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
  *          and the ordering relation on \p RandomAccessIterator's \c value_type is a <em>strict weak ordering</em>, as defined in the
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements.
  *
  *  The following code snippet demonstrates how to use \p sort to sort
  *  a sequence of integers using the \p thrust::host execution policy for parallelization:
@@ -70,7 +68,7 @@ namespace thrust
  *  // A is now {1, 2, 4, 5, 7, 8}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p stable_sort
  *  \see \p sort_by_key
  */
@@ -94,11 +92,11 @@ __host__ __device__
  *  \param first The beginning of the sequence.
  *  \param last The end of the sequence.
  *
- *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator is mutable,
- *          and \p RandomAccessIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and \p RandomAccessIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
  *          and the ordering relation on \p RandomAccessIterator's \c value_type is a <em>strict weak ordering</em>, as defined in the
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements.
  *
  *  The following code snippet demonstrates how to use \p sort to sort
  *  a sequence of integers.
@@ -112,7 +110,7 @@ __host__ __device__
  *  // A is now {1, 2, 4, 5, 7, 8}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p stable_sort
  *  \see \p sort_by_key
  */
@@ -140,11 +138,11 @@ template<typename RandomAccessIterator>
  *  \param comp  Comparison operator.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator is mutable,
  *          and \p RandomAccessIterator's \c value_type is convertible to \p StrictWeakOrdering's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code demonstrates how to sort integers in descending order
  *  using the greater<int> comparison operator using the \p thrust::host execution policy for parallelization:
@@ -160,7 +158,7 @@ template<typename RandomAccessIterator>
  *  // A is now {8, 7, 5, 4, 2, 1};
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p stable_sort
  *  \see \p sort_by_key
  */
@@ -189,11 +187,11 @@ __host__ __device__
  *  \param last The end of the sequence.
  *  \param comp  Comparison operator.
  *
- *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator is mutable,
  *          and \p RandomAccessIterator's \c value_type is convertible to \p StrictWeakOrdering's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code demonstrates how to sort integers in descending order
  *  using the greater<int> comparison operator.
@@ -208,7 +206,7 @@ __host__ __device__
  *  // A is now {8, 7, 5, 4, 2, 1};
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p stable_sort
  *  \see \p sort_by_key
  */
@@ -241,11 +239,11 @@ __host__ __device__
  *  \param last The end of the sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator is mutable,
- *          and \p RandomAccessIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and \p RandomAccessIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
  *          and the ordering relation on \p RandomAccessIterator's \c value_type is a <em>strict weak ordering</em>, as defined in the
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements.
  *
  *  The following code snippet demonstrates how to use \p sort to sort
  *  a sequence of integers using the \p thrust::host execution policy for parallelization:
@@ -260,7 +258,7 @@ __host__ __device__
  *  // A is now {1, 2, 4, 5, 7, 8}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/stable_sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/stable_sort
  *  \see \p sort
  *  \see \p stable_sort_by_key
  */
@@ -288,11 +286,11 @@ __host__ __device__
  *  \param first The beginning of the sequence.
  *  \param last The end of the sequence.
  *
- *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator is mutable,
- *          and \p RandomAccessIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and \p RandomAccessIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
  *          and the ordering relation on \p RandomAccessIterator's \c value_type is a <em>strict weak ordering</em>, as defined in the
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements.
  *
  *  The following code snippet demonstrates how to use \p sort to sort
  *  a sequence of integers.
@@ -306,7 +304,7 @@ __host__ __device__
  *  // A is now {1, 2, 4, 5, 7, 8}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/stable_sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/stable_sort
  *  \see \p sort
  *  \see \p stable_sort_by_key
  */
@@ -338,11 +336,11 @@ template<typename RandomAccessIterator>
  *  \param comp Comparison operator.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator is mutable,
  *          and \p RandomAccessIterator's \c value_type is convertible to \p StrictWeakOrdering's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code demonstrates how to sort integers in descending order
  *  using the greater<int> comparison operator using the \p thrust::host execution policy for parallelization:
@@ -358,7 +356,7 @@ template<typename RandomAccessIterator>
  *  // A is now {8, 7, 5, 4, 2, 1};
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/stable_sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/stable_sort
  *  \see \p sort
  *  \see \p stable_sort_by_key
  */
@@ -391,11 +389,11 @@ __host__ __device__
  *  \param last The end of the sequence.
  *  \param comp Comparison operator.
  *
- *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator is mutable,
  *          and \p RandomAccessIterator's \c value_type is convertible to \p StrictWeakOrdering's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code demonstrates how to sort integers in descending order
  *  using the greater<int> comparison operator.
@@ -410,7 +408,7 @@ __host__ __device__
  *  // A is now {8, 7, 5, 4, 2, 1};
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/stable_sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/stable_sort
  *  \see \p sort
  *  \see \p stable_sort_by_key
  */
@@ -450,12 +448,12 @@ template<typename RandomAccessIterator,
  *  \param values_first The beginning of the value sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator1 is mutable,
- *          and \p RandomAccessIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and \p RandomAccessIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
  *          and the ordering relation on \p RandomAccessIterator1's \c value_type is a <em>strict weak ordering</em>, as defined in the
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
- *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements.
+ *  \tparam RandomAccessIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">Random Access Iterator</a>,
  *          and \p RandomAccessIterator2 is mutable.
  *
  *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
@@ -476,7 +474,7 @@ template<typename RandomAccessIterator,
  *  // values is now {'a', 'c', 'b', 'e', 'f', 'd'}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p stable_sort_by_key
  *  \see \p sort
  */
@@ -510,12 +508,12 @@ __host__ __device__
  *  \param keys_last The end of the key sequence.
  *  \param values_first The beginning of the value sequence.
  *
- *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator1 is mutable,
- *          and \p RandomAccessIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and \p RandomAccessIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
  *          and the ordering relation on \p RandomAccessIterator1's \c value_type is a <em>strict weak ordering</em>, as defined in the
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
- *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements.
+ *  \tparam RandomAccessIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">Random Access Iterator</a>,
  *          and \p RandomAccessIterator2 is mutable.
  *
  *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
@@ -534,7 +532,7 @@ __host__ __device__
  *  // values is now {'a', 'c', 'b', 'e', 'f', 'd'}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p stable_sort_by_key
  *  \see \p sort
  */
@@ -571,13 +569,13 @@ template<typename RandomAccessIterator1,
  *  \param comp Comparison operator.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator1 is mutable,
  *          and \p RandomAccessIterator1's \c value_type is convertible to \p StrictWeakOrdering's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">Random Access Iterator</a>,
  *          and \p RandomAccessIterator2 is mutable.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
  *
@@ -597,7 +595,7 @@ template<typename RandomAccessIterator1,
  *  // values is now {'d', 'f', 'e', 'b', 'c', 'a'}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p stable_sort_by_key
  *  \see \p sort
  */
@@ -635,13 +633,13 @@ __host__ __device__
  *  \param values_first The beginning of the value sequence.
  *  \param comp Comparison operator.
  *
- *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator1 is mutable,
  *          and \p RandomAccessIterator1's \c value_type is convertible to \p StrictWeakOrdering's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">Random Access Iterator</a>,
  *          and \p RandomAccessIterator2 is mutable.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
  *
@@ -660,7 +658,7 @@ __host__ __device__
  *  // values is now {'d', 'f', 'e', 'b', 'c', 'a'}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p stable_sort_by_key
  *  \see \p sort
  */
@@ -699,12 +697,12 @@ template<typename RandomAccessIterator1,
  *  \param values_first The beginning of the value sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator1 is mutable,
- *          and \p RandomAccessIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and \p RandomAccessIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
  *          and the ordering relation on \p RandomAccessIterator1's \c value_type is a <em>strict weak ordering</em>, as defined in the
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
- *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements.
+ *  \tparam RandomAccessIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">Random Access Iterator</a>,
  *          and \p RandomAccessIterator2 is mutable.
  *
  *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
@@ -725,7 +723,7 @@ template<typename RandomAccessIterator1,
  *  // values is now {'a', 'c', 'b', 'e', 'f', 'd'}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p sort_by_key
  *  \see \p stable_sort
  */
@@ -761,12 +759,12 @@ __host__ __device__
  *  \param keys_last The end of the key sequence.
  *  \param values_first The beginning of the value sequence.
  *
- *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator1 is mutable,
- *          and \p RandomAccessIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and \p RandomAccessIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
  *          and the ordering relation on \p RandomAccessIterator1's \c value_type is a <em>strict weak ordering</em>, as defined in the
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
- *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements.
+ *  \tparam RandomAccessIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">Random Access Iterator</a>,
  *          and \p RandomAccessIterator2 is mutable.
  *
  *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
@@ -785,7 +783,7 @@ __host__ __device__
  *  // values is now {'a', 'c', 'b', 'e', 'f', 'd'}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p sort_by_key
  *  \see \p stable_sort
  */
@@ -824,13 +822,13 @@ template<typename RandomAccessIterator1,
  *  \param comp Comparison operator.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator1 is mutable,
  *          and \p RandomAccessIterator1's \c value_type is convertible to \p StrictWeakOrdering's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">Random Access Iterator</a>,
  *          and \p RandomAccessIterator2 is mutable.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
  *
@@ -851,7 +849,7 @@ template<typename RandomAccessIterator1,
  *  \endcode
  *
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p sort_by_key
  *  \see \p stable_sort
  */
@@ -891,13 +889,13 @@ __host__ __device__
  *  \param values_first The beginning of the value sequence.
  *  \param comp Comparison operator.
  *
- *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator1 is mutable,
  *          and \p RandomAccessIterator1's \c value_type is convertible to \p StrictWeakOrdering's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">Random Access Iterator</a>,
  *          and \p RandomAccessIterator2 is mutable.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
  *
@@ -917,7 +915,7 @@ __host__ __device__
  *  \endcode
  *
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p sort_by_key
  *  \see \p stable_sort
  */
@@ -956,10 +954,10 @@ template<typename RandomAccessIterator1,
  *  \return \c true, if the sequence is sorted; \c false, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
- *          \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
+ *          \p ForwardIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
  *          and the ordering on objects of \p ForwardIterator's \c value_type is a <em>strict weak ordering</em>, as defined
- *          in the <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *          in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements.
  *
  *
  *  The following code demonstrates how to use \p is_sorted to test whether the
@@ -990,7 +988,7 @@ template<typename RandomAccessIterator1,
  *  // result == true
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/is_sorted.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/is_sorted
  *  \see is_sorted_until
  *  \see \c sort
  *  \see \c stable_sort
@@ -1014,10 +1012,10 @@ __host__ __device__
  *  \param last  The end of the sequence.
  *  \return \c true, if the sequence is sorted; \c false, otherwise.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
- *          \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
+ *          \p ForwardIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
  *          and the ordering on objects of \p ForwardIterator's \c value_type is a <em>strict weak ordering</em>, as defined
- *          in the <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *          in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements.
  *
  *
  *  The following code demonstrates how to use \p is_sorted to test whether the
@@ -1046,7 +1044,7 @@ __host__ __device__
  *  // result == true
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/is_sorted.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/is_sorted
  *  \see is_sorted_until
  *  \see \c sort
  *  \see \c stable_sort
@@ -1072,10 +1070,10 @@ template<typename ForwardIterator>
  *  \return \c true, if the sequence is sorted according to comp; \c false, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to both \c StrictWeakOrdering's \c first_argument_type
  *          and \c second_argument_type.
- *  \tparam Compare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam Compare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p is_sorted to test whether the
  *  contents of a \c device_vector are stored in descending order using the \p thrust::device execution
@@ -1106,7 +1104,7 @@ template<typename ForwardIterator>
  *  // result == true
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/is_sorted.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/is_sorted
  *  \see \c sort
  *  \see \c stable_sort
  *  \see \c less<T>
@@ -1130,10 +1128,10 @@ __host__ __device__
  *  \param comp  Comparison operator.
  *  \return \c true, if the sequence is sorted according to comp; \c false, otherwise.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to both \c StrictWeakOrdering's \c first_argument_type
  *          and \c second_argument_type.
- *  \tparam Compare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam Compare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p is_sorted to test whether the
  *  contents of a \c device_vector are stored in descending order.
@@ -1162,7 +1160,7 @@ __host__ __device__
  *  // result == true
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/is_sorted.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/is_sorted
  *  \see \c sort
  *  \see \c stable_sort
  *  \see \c less<T>
@@ -1185,8 +1183,8 @@ template<typename ForwardIterator, typename Compare>
  *  \return The last iterator in the input range for which it is sorted.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a> and
- *          \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a> and
+ *          \p ForwardIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  The following code snippet demonstrates how to use \p is_sorted_until to find the first position
  *  in an array where the data becomes unsorted using the \p thrust::host execution policy for
@@ -1227,8 +1225,8 @@ __host__ __device__
  *  \param last The end of the range of interest.
  *  \return The last iterator in the input range for which it is sorted.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a> and
- *          \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a> and
+ *          \p ForwardIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  The following code snippet demonstrates how to use \p is_sorted_until to find the first position
  *  in an array where the data becomes unsorted:
@@ -1270,9 +1268,9 @@ template<typename ForwardIterator>
  *  \return The last iterator in the input range for which it is sorted.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a> and
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a> and
  *          \p ForwardIterator's \c value_type is convertible to \p Compare's \c argument_type.
- *  \tparam Compare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam Compare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p is_sorted_until to find the first position
  *  in an array where the data becomes unsorted in descending order using the \p thrust::host execution
@@ -1317,9 +1315,9 @@ __host__ __device__
  *  \param comp The function object to use for comparison.
  *  \return The last iterator in the input range for which it is sorted.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a> and
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a> and
  *          \p ForwardIterator's \c value_type is convertible to \p Compare's \c argument_type.
- *  \tparam Compare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam Compare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p is_sorted_until to find the first position
  *  in an array where the data becomes unsorted in descending order:
@@ -1355,8 +1353,6 @@ template<typename ForwardIterator, typename Compare>
  *  \} // end reductions
  */
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/sort.inl>
-
diff --git a/thrust/swap.h b/thrust/swap.h
index 246e84387..d8a8be73c 100644
--- a/thrust/swap.h
+++ b/thrust/swap.h
@@ -23,12 +23,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-// empty Doxygen comment below so namespace thrust's documentation will be extracted
-
-/*!
- */
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup utility
  *  \{
@@ -47,7 +42,7 @@ namespace thrust
  *  \param b The second value of interest. After completion,
  *           the value of a will be returned here.
  *
- *  \tparam Assignable is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>.
+ *  \tparam Assignable is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>.
  *
  *  The following code snippet demonstrates how to use \p swap to
  *  swap the contents of two variables.
@@ -94,9 +89,9 @@ inline void swap(Assignable1 &a, Assignable2 &b);
  *          sequence to swap.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator1 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator1's \c value_type must be convertible to \p ForwardIterator2's \c value_type.
- *  \tparam ForwardIterator2 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator2's \c value_type must be convertible to \p ForwardIterator1's \c value_type.
  *
  *  \pre \p first1 may equal \p first2, but the range <tt>[first1, last1)</tt> shall not overlap the range <tt>[first2, first2 + (last1 - first1))</tt> otherwise.
@@ -121,7 +116,7 @@ inline void swap(Assignable1 &a, Assignable2 &b);
  *  // v1[0] == 3, v1[1] == 4, v2[0] == 1, v2[1] == 2
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/swap_ranges.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/swap_ranges
  *  \see \c swap
  */
 template<typename DerivedPolicy,
@@ -146,9 +141,9 @@ __host__ __device__
  *  \return An iterator pointing to one position past the last element of the second
  *          sequence to swap.
  *
- *  \tparam ForwardIterator1 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator1's \c value_type must be convertible to \p ForwardIterator2's \c value_type.
- *  \tparam ForwardIterator2 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator2's \c value_type must be convertible to \p ForwardIterator1's \c value_type.
  *
  *  \pre \p first1 may equal \p first2, but the range <tt>[first1, last1)</tt> shall not overlap the range <tt>[first2, first2 + (last1 - first1))</tt> otherwise.
@@ -171,7 +166,7 @@ __host__ __device__
  *  // v1[0] == 3, v1[1] == 4, v2[0] == 1, v2[1] == 2
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/swap_ranges.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/swap_ranges
  *  \see \c swap
  */
 template<typename ForwardIterator1,
@@ -184,8 +179,6 @@ template<typename ForwardIterator1,
 /*! \} // copying
  */
 
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/swap.inl>
-
diff --git a/thrust/system/cpp/detail/execution_policy.h b/thrust/system/cpp/detail/execution_policy.h
index 27e4db862..1a8193bf3 100644
--- a/thrust/system/cpp/detail/execution_policy.h
+++ b/thrust/system/cpp/detail/execution_policy.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 // put the canonical tag in the same ns as the backend's entry points
@@ -77,5 +76,5 @@ using thrust::system::cpp::execution_policy;
 using thrust::system::cpp::tag;
 
 } // end cpp
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cpp/detail/memory.inl b/thrust/system/cpp/detail/memory.inl
index bbb0bab78..650aa1cb5 100644
--- a/thrust/system/cpp/detail/memory.inl
+++ b/thrust/system/cpp/detail/memory.inl
@@ -14,13 +14,14 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/cpp/memory.h>
 #include <thrust/system/cpp/detail/malloc_and_free.h>
 #include <limits>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace cpp
@@ -47,5 +48,5 @@ void free(pointer<void> ptr)
 
 } // end cpp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cpp/detail/par.h b/thrust/system/cpp/detail/par.h
index d721799d7..c56921327 100644
--- a/thrust/system/cpp/detail/par.h
+++ b/thrust/system/cpp/detail/par.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/allocator_aware_execution_policy.h>
 #include <thrust/system/cpp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace cpp
@@ -35,14 +34,14 @@ struct par_t : thrust::system::cpp::detail::execution_policy<par_t>,
     thrust::system::cpp::detail::execution_policy>
 {
   __host__ __device__
-  par_t() : thrust::system::cpp::detail::execution_policy<par_t>() {}
+  constexpr par_t() : thrust::system::cpp::detail::execution_policy<par_t>() {}
 };
 
 
 } // end detail
 
 
-static const detail::par_t par;
+THRUST_INLINE_CONSTANT detail::par_t par;
 
 
 } // end cpp
@@ -58,5 +57,5 @@ using thrust::system::cpp::par;
 
 
 } // end cpp
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cpp/detail/pointer.inl b/thrust/system/cpp/detail/pointer.inl
deleted file mode 100644
index 7d9de3e55..000000000
--- a/thrust/system/cpp/detail/pointer.inl
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- *  Copyright 2008-2018 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-namespace thrust
-{
-
-// XXX WAR an issue with MSVC 2005 (cl v14.00) incorrectly implementing
-//     pointer_raw_pointer for pointer by specializing it here
-#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400)
-namespace detail
-{
-
-template<typename T>
-  struct pointer_raw_pointer< thrust::cpp::pointer<T> >
-{
-  typedef typename thrust::cpp::pointer<T>::raw_pointer type;
-}; // end pointer_raw_pointer
-
-} // end detail
-#endif
-
-namespace system
-{
-namespace cpp
-{
-
-template<typename T>
-  template<typename OtherT>
-    reference<T> &
-      reference<T>
-        ::operator=(const reference<OtherT> &other)
-{
-  return super_t::operator=(other);
-} // end reference::operator=()
-
-template<typename T>
-  reference<T> &
-    reference<T>
-      ::operator=(const value_type &x)
-{
-  return super_t::operator=(x);
-} // end reference::operator=()
-
-template<typename T>
-__host__ __device__
-void swap(reference<T> a, reference<T> b)
-{
-  a.swap(b);
-} // end swap()
-
-} // end cpp
-} // end system
-} // end thrust
-
diff --git a/thrust/system/cpp/detail/vector.inl b/thrust/system/cpp/detail/vector.inl
index 77f8be3bc..02980c62a 100644
--- a/thrust/system/cpp/detail/vector.inl
+++ b/thrust/system/cpp/detail/vector.inl
@@ -20,8 +20,7 @@
 #include <thrust/system/cpp/vector.h>
 #include <utility>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace cpp
@@ -51,7 +50,7 @@ template<typename T, typename Allocator>
       : super_t(x)
 {}
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
   template<typename T, typename Allocator>
     vector<T,Allocator>
       ::vector(vector &&x)
@@ -89,7 +88,7 @@ template<typename T, typename Allocator>
   return *this;
 }
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
   template<typename T, typename Allocator>
     vector<T,Allocator> &
       vector<T,Allocator>
@@ -99,6 +98,27 @@ template<typename T, typename Allocator>
     return *this;
   }
 #endif
+  
+  template<typename T, typename Allocator>
+    vector<T,Allocator>
+      ::vector(std::initializer_list<T> il)
+        : super_t(il)
+  {}
+  
+  template<typename T, typename Allocator>
+    vector<T,Allocator>
+      ::vector(std::initializer_list<T> il, const Allocator& alloc)
+        : super_t(il, alloc)
+  {}
+
+  template<typename T, typename Allocator>
+    vector<T,Allocator> &
+      vector<T,Allocator>
+        ::operator=(std::initializer_list<T> il)
+  {
+    super_t::operator=(il);
+    return *this;
+  }
 
 template<typename T, typename Allocator>
   template<typename OtherT, typename OtherAllocator>
@@ -122,5 +142,5 @@ template<typename T, typename Allocator>
       
 } // end cpp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cpp/execution_policy.h b/thrust/system/cpp/execution_policy.h
index 3bf521be3..0d8a9a367 100644
--- a/thrust/system/cpp/execution_policy.h
+++ b/thrust/system/cpp/execution_policy.h
@@ -14,12 +14,12 @@
  *  limitations under the License.
  */
 
-#pragma once
-
 /*! \file thrust/system/cpp/execution_policy.h
- *  \brief Execution policies for Thrust's standard C++ system.
+ *  \brief Execution policies for Thrust's Standard C++ system.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 
 // get the execution policies definitions first
@@ -76,8 +76,7 @@
 // define these entities here for the purpose of Doxygenating them
 // they are actually defined elsewhere
 #if 0
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace cpp
@@ -104,7 +103,7 @@ struct execution_policy : thrust::execution_policy<DerivedPolicy>
 struct tag : thrust::system::cpp::execution_policy<tag> { unspecified };
 
 
-/*! 
+/*!
  *  \p thrust::system::cpp::par is the parallel execution policy associated with Thrust's standard
  *  C++ backend system.
  *
@@ -151,7 +150,7 @@ static const unspecified par;
 
 } // end cpp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 #endif
 
 
diff --git a/thrust/system/cpp/memory.h b/thrust/system/cpp/memory.h
index 8eac91891..a18abeb8e 100644
--- a/thrust/system/cpp/memory.h
+++ b/thrust/system/cpp/memory.h
@@ -15,7 +15,7 @@
  */
 
 /*! \file thrust/system/cpp/memory.h
- *  \brief Managing memory associated with Thrust's standard C++ system.
+ *  \brief Managing memory associated with Thrust's Standard C++ system.
  */
 
 #pragma once
@@ -27,12 +27,10 @@
 #include <thrust/mr/allocator.h>
 #include <ostream>
 
-namespace thrust
-{
-namespace system
-{
-namespace cpp
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace cpp
 {
+
 /*! Allocates an area of memory available to Thrust's <tt>cpp</tt> system.
  *  \param n Number of bytes to allocate.
  *  \return A <tt>cpp::pointer<void></tt> pointing to the beginning of the newly
@@ -66,83 +64,37 @@ inline pointer<T> malloc(std::size_t n);
  */
 inline void free(pointer<void> ptr);
 
-// XXX upon c++11
-// template<typename T>
-// using allocator = thrust::mr::stateless_resource_allocator<T, memory_resource>;
-
-/*! \p cpp::allocator is the default allocator used by the \p cpp system's containers such as
- *  <tt>cpp::vector</tt> if no user-specified allocator is provided. \p cpp::allocator allocates
- *  (deallocates) storage with \p cpp::malloc (\p cpp::free).
+/*! \p cpp::allocator is the default allocator used by the \p cpp system's
+ *  containers such as <tt>cpp::vector</tt> if no user-specified allocator is
+ *  provided. \p cpp::allocator allocates (deallocates) storage with \p
+ *  cpp::malloc (\p cpp::free).
  */
 template<typename T>
-  struct allocator
-    : thrust::mr::stateless_resource_allocator<
-        T,
-        memory_resource
-    >
-{
-private:
-    typedef thrust::mr::stateless_resource_allocator<
-        T,
-        memory_resource
-    > base;
-
-public:
-  /*! The \p rebind metafunction provides the type of an \p allocator
-   *  instantiated with another type.
-   *
-   *  \tparam U The other type to use for instantiation.
-   */
-  template<typename U>
-    struct rebind
-  {
-    /*! The typedef \p other gives the type of the rebound \p allocator.
-     */
-    typedef allocator<U> other;
-  };
-
-  /*! No-argument constructor has no effect.
-   */
-  __host__ __device__
-  inline allocator() {}
+using allocator = thrust::mr::stateless_resource_allocator<
+  T, thrust::system::cpp::memory_resource
+>;
 
-  /*! Copy constructor has no effect.
-   */
-  __host__ __device__
-  inline allocator(const allocator & other) : base(other) {}
-
-  /*! Constructor from other \p allocator has no effect.
-   */
-  template<typename U>
-  __host__ __device__
-  inline allocator(const allocator<U> & other) : base(other) {}
-
-  /*! Destructor has no effect.
-   */
-  __host__ __device__
-  inline ~allocator() {}
-}; // end allocator
-
-} // end cpp
-
-/*! \}
+/*! \p cpp::universal_allocator allocates memory that can be used by the \p cpp
+ *  system and host systems.
  */
+template<typename T>
+using universal_allocator = thrust::mr::stateless_resource_allocator<
+  T, thrust::system::cpp::universal_memory_resource
+>;
 
-} // end system
+}} // namespace system::cpp
 
 /*! \namespace thrust::cpp
  *  \brief \p thrust::cpp is a top-level alias for thrust::system::cpp.
  */
 namespace cpp
 {
-
 using thrust::system::cpp::malloc;
 using thrust::system::cpp::free;
 using thrust::system::cpp::allocator;
+} // namespace cpp
 
-} // end cpp
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/cpp/detail/memory.inl>
 
diff --git a/thrust/system/cpp/memory_resource.h b/thrust/system/cpp/memory_resource.h
index e89fd25fd..04b4e3cf8 100644
--- a/thrust/system/cpp/memory_resource.h
+++ b/thrust/system/cpp/memory_resource.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2018 NVIDIA Corporation
+ *  Copyright 2018-2020 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
  */
 
 /*! \file cpp/memory_resource.h
- *  \brief Memory resources for the CPP system.
+ *  \brief Memory resources for the Standard C++ system.
  */
 
 #pragma once
@@ -26,11 +26,8 @@
 
 #include <thrust/system/cpp/pointer.h>
 
-namespace thrust
-{
-namespace system
-{
-namespace cpp
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace cpp
 {
 
 //! \cond
@@ -40,23 +37,35 @@ namespace detail
         thrust::mr::new_delete_resource,
         thrust::cpp::pointer<void>
     > native_resource;
-}
+
+    typedef thrust::mr::fancy_pointer_resource<
+        thrust::mr::new_delete_resource,
+        thrust::cpp::universal_pointer<void>
+    > universal_native_resource;
+} // namespace detail
 //! \endcond
 
 /*! \addtogroup memory_resources Memory Resources
- *  \ingroup memory_management_classes
+ *  \ingroup memory_management
+ *  \{
  */
 
-/*! The memory resource for the CPP system. Uses \p mr::new_delete_resource and tags it with \p cpp::pointer. */
+/*! The memory resource for the Standard C++ system. Uses \p
+ *  mr::new_delete_resource and tags it with \p cpp::pointer.
+ */
 typedef detail::native_resource memory_resource;
-/*! An alias for \p cpp::memory_resource. */
-typedef detail::native_resource universal_memory_resource;
-/*! An alias for \p cpp::memory_resource. */
+/*! The unified memory resource for the Standard C++ system. Uses
+ *  \p mr::new_delete_resource and tags it with \p cpp::universal_pointer.
+ */
+typedef detail::universal_native_resource universal_memory_resource;
+/*! An alias for \p cpp::universal_memory_resource. */
 typedef detail::native_resource universal_host_pinned_memory_resource;
 
-/*! \}
+/*! \} // memory_resources
  */
 
-}
-}
-}
+
+}} // namespace system::cpp
+
+THRUST_NAMESPACE_END
+
diff --git a/thrust/system/cpp/pointer.h b/thrust/system/cpp/pointer.h
index 8efeb33c4..f204fa375 100644
--- a/thrust/system/cpp/pointer.h
+++ b/thrust/system/cpp/pointer.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2018 NVIDIA Corporation
+ *  Copyright 2008-2020 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,116 +14,37 @@
  *  limitations under the License.
  */
 
+/*! \file thrust/system/cpp/memory.h
+ *  \brief Managing memory associated with Thrust's TBB system.
+ */
+
 #pragma once
 
 #include <thrust/detail/config.h>
+#include <type_traits>
 #include <thrust/system/cpp/detail/execution_policy.h>
-#include <thrust/detail/type_traits.h>
 #include <thrust/detail/pointer.h>
 #include <thrust/detail/reference.h>
 
-namespace thrust
-{
-namespace system
-{
-namespace cpp
-{
-
-template<typename> class pointer;
-
-} // end cpp
-} // end system
-} // end thrust
-
-
-/*! \cond
- */
-
-// specialize thrust::iterator_traits to avoid problems with the name of
-// pointer's constructor shadowing its nested pointer type
-// do this before pointer is defined so the specialization is correctly
-// used inside the definition
-namespace thrust
-{
-
-template<typename Element>
-  struct iterator_traits<thrust::system::cpp::pointer<Element> >
-{
-  private:
-    typedef thrust::system::cpp::pointer<Element> ptr;
-
-  public:
-    typedef typename ptr::iterator_category       iterator_category;
-    typedef typename ptr::value_type              value_type;
-    typedef typename ptr::difference_type         difference_type;
-    typedef ptr                                   pointer;
-    typedef typename ptr::reference               reference;
-}; // end iterator_traits
-
-} // end thrust
-
-/*! \endcond
- */
-
-
-namespace thrust
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace cpp
 {
-namespace system
-{
-
-/*! \addtogroup system_backends Systems
- *  \ingroup system
- *  \{
- */
-
-/*! \namespace thrust::system::cpp
- *  \brief \p thrust::system::cpp is the namespace containing functionality for allocating, manipulating,
- *         and deallocating memory available to Thrust's standard C++ backend system.
- *         The identifiers are provided in a separate namespace underneath <tt>thrust::system</tt>
- *         for import convenience but are also aliased in the top-level <tt>thrust::cpp</tt>
- *         namespace for easy access.
- *
- */
-namespace cpp
-{
-
-// forward declaration of reference for pointer
-template<typename Element> class reference;
-
-/*! \cond
- */
-
-// XXX nvcc + msvc have trouble instantiating reference below
-//     this is a workaround
-namespace detail
-{
-
-template<typename Element>
-  struct reference_msvc_workaround
-{
-  typedef thrust::system::cpp::reference<Element> type;
-}; // end reference_msvc_workaround
-
-} // end detail
-
-/*! \endcond
- */
-
 
-/*! \p pointer stores a pointer to an object allocated in memory available to the cpp system.
- *  This type provides type safety when dispatching standard algorithms on ranges resident
- *  in cpp memory.
+/*! \p cpp::pointer stores a pointer to an object allocated in memory accessible
+ *  by the \p cpp system. This type provides type safety when dispatching
+ *  algorithms on ranges resident in \p cpp memory.
  *
- *  \p pointer has pointer semantics: it may be dereferenced and manipulated with pointer arithmetic.
+ *  \p cpp::pointer has pointer semantics: it may be dereferenced and
+ *  manipulated with pointer arithmetic.
  *
- *  \p pointer can be created with the function \p cpp::malloc, or by explicitly calling its constructor
- *  with a raw pointer.
+ *  \p cpp::pointer can be created with the function \p cpp::malloc, or by
+ *  explicitly calling its constructor with a raw pointer.
  *
- *  The raw pointer encapsulated by a \p pointer may be obtained by eiter its <tt>get</tt> member function
- *  or the \p raw_pointer_cast function.
+ *  The raw pointer encapsulated by a \p cpp::pointer may be obtained by eiter its
+ *  <tt>get</tt> member function or the \p raw_pointer_cast function.
  *
- *  \note \p pointer is not a "smart" pointer; it is the programmer's responsibility to deallocate memory
- *  pointed to by \p pointer.
+ *  \note \p cpp::pointer is not a "smart" pointer; it is the programmer's
+ *        responsibility to deallocate memory pointed to by \p cpp::pointer.
  *
  *  \tparam T specifies the type of the pointee.
  *
@@ -131,221 +52,66 @@ template<typename Element>
  *  \see cpp::free
  *  \see raw_pointer_cast
  */
-template<typename T>
-  class pointer
-    : public thrust::pointer<
-               T,
-               thrust::system::cpp::tag,
-               thrust::system::cpp::reference<T>,
-               thrust::system::cpp::pointer<T>
-             >
-{
-  /*! \cond
-   */
-
-  private:
-    typedef thrust::pointer<
-      T,
-      thrust::system::cpp::tag,
-      //thrust::system::cpp::reference<T>,
-      typename detail::reference_msvc_workaround<T>::type,
-      thrust::system::cpp::pointer<T>
-    > super_t;
-
-  /*! \endcond
-   */
-
-  public:
-    // note that cpp::pointer's member functions need __host__ __device__
-    // to interoperate with nvcc + iterators' dereference member function
-
-    /*! \p pointer's no-argument constructor initializes its encapsulated pointer to \c 0.
-     */
-    __host__ __device__
-    pointer() : super_t() {}
-
-    #if THRUST_CPP_DIALECT >= 2011
-    // NOTE: This is needed so that Thrust smart pointers can be used in
-    // `std::unique_ptr`.
-    __host__ __device__
-    pointer(decltype(nullptr)) : super_t(nullptr) {}
-    #endif
-
-    /*! This constructor allows construction of a <tt>pointer<const T></tt> from a <tt>T*</tt>.
-     *
-     *  \param ptr A raw pointer to copy from, presumed to point to a location in memory
-     *         accessible by the \p cpp system.
-     *  \tparam OtherT \p OtherT shall be convertible to \p T.
-     */
-    template<typename OtherT>
-    __host__ __device__
-    explicit pointer(OtherT *ptr) : super_t(ptr) {}
-
-    /*! This constructor allows construction from another pointer-like object with related type.
-     *
-     *  \param other The \p OtherPointer to copy.
-     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
-     *          to \p thrust::system::cpp::tag and its element type shall be convertible to \p T.
-     */
-    template<typename OtherPointer>
-    __host__ __device__
-    pointer(const OtherPointer &other,
-            typename thrust::detail::enable_if_pointer_is_convertible<
-              OtherPointer,
-              pointer
-            >::type * = 0) : super_t(other) {}
-
-    /*! This constructor allows construction from another pointer-like object with \p void type.
-     *
-     *  \param other The \p OtherPointer to copy.
-     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
-     *          to \p thrust::system::cpp::tag and its element type shall be \p void.
-     */
-    template<typename OtherPointer>
-    __host__ __device__
-    explicit
-    pointer(const OtherPointer &other,
-            typename thrust::detail::enable_if_void_pointer_is_system_convertible<
-              OtherPointer,
-              pointer
-            >::type * = 0) : super_t(other) {}
-
-    /*! Assignment operator allows assigning from another pointer-like object with related type.
-     *
-     *  \param other The other pointer-like object to assign from.
-     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
-     *          to \p thrust::system::cpp::tag and its element type shall be convertible to \p T.
-     */
-    template<typename OtherPointer>
-    __host__ __device__
-    typename thrust::detail::enable_if_pointer_is_convertible<
-      OtherPointer,
-      pointer,
-      pointer &
-    >::type
-    operator=(const OtherPointer &other)
-    {
-      return super_t::operator=(other);
-    }
-
-    #if THRUST_CPP_DIALECT >= 2011
-    // NOTE: This is needed so that Thrust smart pointers can be used in
-    // `std::unique_ptr`.
-    __host__ __device__
-    pointer& operator=(decltype(nullptr))
-    {
-      super_t::operator=(nullptr);
-      return *this;
-    }
-    #endif
-}; // end pointer
-
-/*! \p reference is a wrapped reference to an object stored in memory available to the \p cpp system.
- *  \p reference is the type of the result of dereferencing a \p cpp::pointer.
+template <typename T>
+using pointer = thrust::pointer<
+  T,
+  thrust::system::cpp::tag,
+  thrust::tagged_reference<T, thrust::system::cpp::tag>
+>;
+
+/*! \p cpp::universal_pointer stores a pointer to an object allocated in memory
+ * accessible by the \p cpp system and host systems.
  *
- *  \tparam T Specifies the type of the referenced object.
+ *  \p cpp::universal_pointer has pointer semantics: it may be dereferenced and
+ *  manipulated with pointer arithmetic.
+ *
+ *  \p cpp::universal_pointer can be created with \p cpp::universal_allocator
+ *  or by explicitly calling its constructor with a raw pointer.
+ *
+ *  The raw pointer encapsulated by a \p cpp::universal_pointer may be obtained
+ *  by eiter its <tt>get</tt> member function or the \p raw_pointer_cast
+ *  function.
+ *
+ *  \note \p cpp::universal_pointer is not a "smart" pointer; it is the
+ *        programmer's responsibility to deallocate memory pointed to by
+ *        \p cpp::universal_pointer.
+ *
+ *  \tparam T specifies the type of the pointee.
+ *
+ *  \see cpp::universal_allocator
+ *  \see raw_pointer_cast
  */
-template<typename T>
-  class reference
-    : public thrust::reference<
-               T,
-               thrust::system::cpp::pointer<T>,
-               thrust::system::cpp::reference<T>
-             >
-{
-  /*! \cond
-   */
-
-  private:
-    typedef thrust::reference<
-      T,
-      thrust::system::cpp::pointer<T>,
-      thrust::system::cpp::reference<T>
-    > super_t;
-
-  /*! \endcond
-   */
-
-  public:
-    /*! \cond
-     */
-
-    typedef typename super_t::value_type value_type;
-    typedef typename super_t::pointer    pointer;
-
-    /*! \endcond
-     */
-
-    /*! This constructor initializes this \p reference to refer to an object
-     *  pointed to by the given \p pointer. After this \p reference is constructed,
-     *  it shall refer to the object pointed to by \p ptr.
-     *
-     *  \param ptr A \p pointer to copy from.
-     */
-    __host__ __device__
-    explicit reference(const pointer &ptr)
-      : super_t(ptr)
-    {}
-
-    /*! This constructor accepts a const reference to another \p reference of related type.
-     *  After this \p reference is constructed, it shall refer to the same object as \p other.
-     *
-     *  \param other A \p reference to copy from.
-     *  \tparam OtherT The element type of the other \p reference.
-     *
-     *  \note This constructor is templated primarily to allow initialization of <tt>reference<const T></tt>
-     *        from <tt>reference<T></tt>.
-     */
-    template<typename OtherT>
-    __host__ __device__
-    reference(const reference<OtherT> &other,
-              typename thrust::detail::enable_if_convertible<
-                typename reference<OtherT>::pointer,
-                pointer
-              >::type * = 0)
-      : super_t(other)
-    {}
-
-    /*! Copy assignment operator copy assigns from another \p reference of related type.
-     *
-     *  \param other The other \p reference to assign from.
-     *  \return <tt>*this</tt>
-     *  \tparam OtherT The element type of the other \p reference.
-     */
-    template<typename OtherT>
-    reference &operator=(const reference<OtherT> &other);
-
-    /*! Assignment operator assigns from a \p value_type.
-     *
-     *  \param x The \p value_type to assign from.
-     *  \return <tt>*this</tt>
-     */
-    reference &operator=(const value_type &x);
-}; // end reference
-
-/*! Exchanges the values of two objects referred to by \p reference.
- *  \p x The first \p reference of interest.
- *  \p y The second \p reference of interest.
+template <typename T>
+using universal_pointer = thrust::pointer<
+  T,
+  thrust::system::cpp::tag,
+  typename std::add_lvalue_reference<T>::type
+>;
+
+/*! \p reference is a wrapped reference to an object stored in memory available
+ *  to the \p cpp system. \p reference is the type of the result of
+ *  dereferencing a \p cpp::pointer.
+ *
+ *  \tparam T Specifies the type of the referenced object.
  */
-template<typename T>
-__host__ __device__
-void swap(reference<T> x, reference<T> y);
+template <typename T>
+using reference = thrust::reference<T, thrust::system::cpp::tag>;
 
-} // end cpp
+}} // namespace system::cpp
 
-/*! \}
+/*! \addtogroup system_backends Systems
+ *  \ingroup system
+ *  \{
  */
 
-} // end system
-
+/*! \namespace thrust::cpp
+ *  \brief \p thrust::cpp is a top-level alias for \p thrust::system::cpp. */
 namespace cpp
 {
-
 using thrust::system::cpp::pointer;
+using thrust::system::cpp::universal_pointer;
 using thrust::system::cpp::reference;
+} // namespace cpp
 
-} // end cpp
-
-} // end thrust
+THRUST_NAMESPACE_END
 
-#include <thrust/system/cpp/detail/pointer.inl>
diff --git a/thrust/system/cpp/vector.h b/thrust/system/cpp/vector.h
index 1748f3d6f..2a418dbc3 100644
--- a/thrust/system/cpp/vector.h
+++ b/thrust/system/cpp/vector.h
@@ -26,145 +26,57 @@
 #include <thrust/detail/vector_base.h>
 #include <vector>
 
-namespace thrust
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace cpp
 {
 
-// forward declaration of host_vector
-template<typename T, typename Allocator> class host_vector;
-
-namespace system
-{
-namespace cpp
-{
-
-// XXX upon c++11
-// template<typename T, typename Allocator = allocator<T> > using vector = thrust::detail::vector_base<T,Allocator>;
-
 /*! \p cpp::vector is a container that supports random access to elements,
  *  constant time removal of elements at the end, and linear time insertion
  *  and removal of elements at the beginning or in the middle. The number of
  *  elements in a \p cpp::vector may vary dynamically; memory management is
  *  automatic. The elements contained in a \p cpp::vector reside in memory
- *  available to the \p cpp system.
+ *  accessible by the \p cpp system.
  *
  *  \tparam T The element type of the \p cpp::vector.
- *  \tparam Allocator The allocator type of the \p cpp::vector. Defaults to \p cpp::allocator.
+ *  \tparam Allocator The allocator type of the \p cpp::vector.
+ *          Defaults to \p cpp::allocator.
  *
- *  \see http://www.sgi.com/tech/stl/Vector.html
+ *  \see https://en.cppreference.com/w/cpp/container/vector
  *  \see host_vector For the documentation of the complete interface which is
- *                   shared by \p cpp::vector
+ *                   shared by \p cpp::vector.
  *  \see device_vector
+ *  \see universal_vector
  */
-template<typename T, typename Allocator = allocator<T> >
-  class vector
-    : public thrust::detail::vector_base<T,Allocator>
-{
-  /*! \cond
-   */
-  private:
-    typedef thrust::detail::vector_base<T,Allocator> super_t;
-  /*! \endcond
-   */
-
-  public:
-
-  /*! \cond
-   */
-    typedef typename super_t::size_type  size_type;
-    typedef typename super_t::value_type value_type;
-
-  /*! \endcond
-   */
-
-    /*! This constructor creates an empty \p cpp::vector.
-     */
-    vector();
-
-    /*! This constructor creates a \p cpp::vector with \p n default-constructed elements.
-     *  \param n The size of the \p cpp::vector to create.
-     */
-    explicit vector(size_type n);
-
-    /*! This constructor creates a \p cpp::vector with \p n copies of \p value.
-     *  \param n The size of the \p cpp::vector to create.
-     *  \param value An element to copy.
-     */
-    explicit vector(size_type n, const value_type &value);
-
-    /*! Copy constructor copies from another \p cpp::vector.
-     *  \param x The other \p cpp::vector to copy.
-     */
-    vector(const vector &x);
-
-  #if __cplusplus >= 201103L
-    /*! Move constructor moves from over another \p cpp::vector.
-     *  \param x The other \p cpp::vector to move from.
-     */
-    vector(vector &&x);
-  #endif
-
-    /*! This constructor copies from another Thrust vector-like object.
-     *  \param x The other object to copy from.
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector(const thrust::detail::vector_base<OtherT,OtherAllocator> &x);
-
-    /*! This constructor copies from a \c std::vector.
-     *  \param x The \c std::vector to copy from.
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector(const std::vector<OtherT,OtherAllocator> &x);
-
-    /*! This constructor creates a \p cpp::vector by copying from a range.
-     *  \param first The beginning of the range.
-     *  \param last The end of the range.
-     */
-    template<typename InputIterator>
-    vector(InputIterator first, InputIterator last);
-
-    // XXX vector_base should take a Derived type so we don't have to define these superfluous assigns
-
-    /*! Assignment operator assigns from another \p cpp::vector.
-     *  \param x The other object to assign from.
-     *  \return <tt>*this</tt>
-     */
-    vector &operator=(const vector &x);
-
-  #if __cplusplus >= 201103L
-    /*! Move assignment operator moves from another \p cpp::vector.
-     *  \param x The other \p cpp::vector to move from.
-     *  \return <tt>*this</tt>
-     */
-     vector &operator=(vector &&x);
-  #endif
-
-    /*! Assignment operator assigns from a \c std::vector.
-     *  \param x The \c std::vector to assign from.
-     *  \return <tt>*this</tt>
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector &operator=(const std::vector<OtherT,OtherAllocator> &x);
-
-    /*! Assignment operator assigns from another Thrust vector-like object.
-     *  \param x The other object to assign from.
-     *  \return <tt>*this</tt>
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector &operator=(const thrust::detail::vector_base<OtherT,OtherAllocator> &x);
-}; // end vector
+template <typename T, typename Allocator = thrust::system::cpp::allocator<T>>
+using vector = thrust::detail::vector_base<T, Allocator>;
+
+/*! \p cpp::universal_vector is a container that supports random access to
+ *  elements, constant time removal of elements at the end, and linear time
+ *  insertion and removal of elements at the beginning or in the middle. The
+ *  number of elements in a \p cpp::universal_vector may vary dynamically;
+ *  memory management is automatic. The elements contained in a
+ *  \p cpp::universal_vector reside in memory accessible by the \p cpp system
+ *  and host systems.
+ *
+ *  \tparam T The element type of the \p cpp::universal_vector.
+ *  \tparam Allocator The allocator type of the \p cpp::universal_vector.
+ *          Defaults to \p cpp::universal_allocator.
+ *
+ *  \see https://en.cppreference.com/w/cpp/container/vector
+ *  \see host_vector For the documentation of the complete interface which is
+ *                   shared by \p cpp::universal_vector
+ *  \see device_vector
+ *  \see universal_vector
+ */
+template <typename T, typename Allocator = thrust::system::cpp::universal_allocator<T>>
+using universal_vector = thrust::detail::vector_base<T, Allocator>;
 
-} // end cpp
-} // end system
+}} // namespace system::cpp
 
-// alias system::cpp names at top-level
 namespace cpp
 {
-
 using thrust::system::cpp::vector;
+using thrust::system::cpp::universal_vector;
+}
 
-} // end cpp
-
-} // end thrust
-
-#include <thrust/system/cpp/detail/vector.inl>
-
+THRUST_NAMESPACE_END
diff --git a/thrust/system/cuda/config.h b/thrust/system/cuda/config.h
index b64e0c8b7..f6c8b9cb3 100644
--- a/thrust/system/cuda/config.h
+++ b/thrust/system/cuda/config.h
@@ -26,26 +26,71 @@
  ******************************************************************************/
 #pragma once
 
+
+#ifdef THRUST_DEBUG_SYNC
+#define THRUST_DEBUG_SYNC_FLAG true
+#define CUB_DEBUG_SYNC
+#else
+#define THRUST_DEBUG_SYNC_FLAG false
+#endif
+
+
 #include <thrust/detail/config.h>
 
-#define THRUST_UNUSED_VAR(expr) do { (void)(expr); } while (0)
+// We don't directly include <cub/version.cuh> since it doesn't exist in
+// older releases. This header will always pull in version info:
+#include <cub/util_namespace.cuh>
+#include <cub/util_debug.cuh>
+
+#include <cub/detail/detect_cuda_runtime.cuh>
+
+/**
+ * \def THRUST_RUNTIME_FUNCTION
+ *
+ * Execution space for functions that can use the CUDA runtime API (`__host__`
+ * when RDC is off, `__host__ __device__` when RDC is on).
+ */
+#define THRUST_RUNTIME_FUNCTION CUB_RUNTIME_FUNCTION
+
+/**
+ * \def THRUST_RDC_ENABLED
+ *
+ * Defined if RDC is enabled.
+ */
+#ifdef CUB_RDC_ENABLED
+#define THRUST_RDC_ENABLED
+#endif
 
-#if defined(__CUDACC__)
-#  if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__))
-#    define __THRUST_HAS_CUDART__ 1
-#    define THRUST_RUNTIME_FUNCTION __host__ __device__ __forceinline__
-#  else
-#    define __THRUST_HAS_CUDART__ 0
-#    define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
-#  endif
+/**
+ * \def __THRUST_HAS_CUDART__
+ *
+ * Whether or not the active compiler pass is allowed to invoke device kernels
+ * or methods from the CUDA runtime API.
+ *
+ * This macro should not be used in Thrust, as it depends on `__CUDA_ARCH__`
+ * and is not compatible with `NV_IF_TARGET`. It is provided for legacy
+ * purposes only.
+ *
+ * Replace any usages with `THRUST_RDC_ENABLED` and `NV_IF_TARGET`.
+ */
+#ifdef CUB_RUNTIME_ENABLED
+#define __THRUST_HAS_CUDART__ 1
 #else
-#  define __THRUST_HAS_CUDART__ 0
-#  define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
+#define __THRUST_HAS_CUDART__ 0
 #endif
 
+// These definitions were intended for internal use only and are now obsolete.
+// If you relied on them, consider porting your code to use the functionality
+// in libcu++'s <nv/target> header.
+//
+// For a temporary workaround, define THRUST_PROVIDE_LEGACY_ARCH_MACROS to make
+// them available again. These should be considered deprecated and will be
+// fully removed in a future version.
+#ifdef THRUST_PROVIDE_LEGACY_ARCH_MACROS
 #ifdef __CUDA_ARCH__
 #define THRUST_DEVICE_CODE
-#endif
+#endif // __CUDA_ARCH__
+#endif // THRUST_PROVIDE_LEGACY_ARCH_MACROS
 
 #ifdef THRUST_AGENT_ENTRY_NOINLINE
 #define THRUST_AGENT_ENTRY_INLINE_ATTR __noinline__
@@ -64,12 +109,38 @@
 #define THRUST_AGENT_ENTRY(...) THRUST_AGENT_ENTRY_INLINE_ATTR __device__ static void entry(__VA_ARGS__)
 #endif
 
-#ifdef THRUST_DEBUG_SYNC
-#define THRUST_DEBUG_SYNC_FLAG true
-#else
-#define THRUST_DEBUG_SYNC_FLAG false
+#ifndef THRUST_IGNORE_CUB_VERSION_CHECK
+
+#include <thrust/version.h>
+#if THRUST_VERSION != CUB_VERSION
+#error The version of CUB in your include path is not compatible with this release of Thrust. CUB is now included in the CUDA Toolkit, so you no longer need to use your own checkout of CUB. Define THRUST_IGNORE_CUB_VERSION_CHECK to ignore this.
+#endif
+
+// Make sure the CUB namespace has been declared using the modern macros:
+CUB_NAMESPACE_BEGIN
+CUB_NAMESPACE_END
+
+#else // THRUST_IGNORE_CUB_VERSION_CHECK
+
+// Make sure the CUB namespace has been declared. Use the old macros for compat
+// with older CUB:
+CUB_NS_PREFIX
+namespace cub {}
+CUB_NS_POSTFIX
+
+// Older versions of CUB do not define this. Set it to a reasonable default if
+// not provided.
+#ifndef CUB_NS_QUALIFIER
+#define CUB_NS_QUALIFIER ::cub
 #endif
 
-#define THRUST_CUB_NS_PREFIX namespace thrust {   namespace cuda_cub {
-#define THRUST_CUB_NS_POSTFIX }  }
+#endif // THRUST_IGNORE_CUB_VERSION_CHECK
 
+// Pull the fully qualified cub:: namespace into the thrust:: namespace so we
+// don't have to use CUB_NS_QUALIFIER as long as we're in thrust::.
+THRUST_NAMESPACE_BEGIN
+namespace cub
+{
+using namespace CUB_NS_QUALIFIER;
+}
+THRUST_NAMESPACE_END
diff --git a/thrust/system/cuda/detail/adjacent_difference.h b/thrust/system/cuda/detail/adjacent_difference.h
index 6e1ac05ca..284611235 100644
--- a/thrust/system/cuda/detail/adjacent_difference.h
+++ b/thrust/system/cuda/detail/adjacent_difference.h
@@ -26,23 +26,28 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-#include <thrust/system/cuda/config.h>
 
 #include <thrust/detail/cstdint.h>
+#include <thrust/detail/minmax.h>
 #include <thrust/detail/temporary_array.h>
-#include <thrust/system/cuda/detail/util.h>
-#include <thrust/system/cuda/detail/cub/device/device_select.cuh>
-#include <thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh>
-#include <thrust/system/cuda/detail/core/agent_launcher.h>
-#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/detail/type_traits.h>
 #include <thrust/functional.h>
-#include <thrust/distance.h>
-#include <thrust/detail/mpl/math.h>
-#include <thrust/detail/minmax.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/dispatch.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+#include <thrust/type_traits/remove_cvref.h>
 
-THRUST_BEGIN_NS
+#include <cub/device/device_adjacent_difference.cuh>
+#include <cub/device/device_select.cuh>
+#include <cub/util_math.cuh>
+
+THRUST_NAMESPACE_BEGIN
 
 template <typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename BinaryFunction>
 __host__ __device__ OutputIterator
@@ -57,378 +62,114 @@ namespace cuda_cub {
 
 namespace __adjacent_difference {
 
-  namespace mpl = thrust::detail::mpl::math;
-
-  template <int                      _BLOCK_THREADS,
-            int                      _ITEMS_PER_THREAD = 1,
-            cub::BlockLoadAlgorithm  _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
-            cub::CacheLoadModifier   _LOAD_MODIFIER    = cub::LOAD_DEFAULT,
-            cub::BlockStoreAlgorithm _STORE_ALGORITHM  = cub::BLOCK_STORE_DIRECT,
-            int                      _MIN_BLOCKS       = 1>
-  struct PtxPolicy
-  {
-    enum
-    {
-      BLOCK_THREADS    = _BLOCK_THREADS,
-      ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
-      ITEMS_PER_TILE   = BLOCK_THREADS * ITEMS_PER_THREAD,
-      MIN_BLOCKS       = _MIN_BLOCKS
-    };
-
-    static const cub::BlockLoadAlgorithm  LOAD_ALGORITHM  = _LOAD_ALGORITHM;
-    static const cub::CacheLoadModifier   LOAD_MODIFIER   = _LOAD_MODIFIER;
-    static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
-  };
-
-  template<int INPUT_SIZE, int NOMINAL_4B_ITEMS_PER_THREAD>
-  struct items_per_thread
-  {
-    enum
-    {
-      value = (INPUT_SIZE <= 8)
-                  ? NOMINAL_4B_ITEMS_PER_THREAD
-                  : mpl::min<
-                        int,
-                        NOMINAL_4B_ITEMS_PER_THREAD,
-                        mpl::max<int,
-                                 1,
-                                 ((NOMINAL_4B_ITEMS_PER_THREAD * 8) +
-                                  INPUT_SIZE - 1) /
-                                     INPUT_SIZE>::value>::value
-    };
-  };
-
-  template<class Arch, class T>
-  struct Tuning;
-  
-  template <class T>
-  struct Tuning<sm30, T>
-  {
-    enum
-    {
-      INPUT_SIZE                  = sizeof(T),
-      NOMINAL_4B_ITEMS_PER_THREAD = 7,
-      ITEMS_PER_THREAD            = items_per_thread<INPUT_SIZE,
-                                          NOMINAL_4B_ITEMS_PER_THREAD>::value
-    };
-    typedef PtxPolicy<128,
-                      ITEMS_PER_THREAD,
-                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
-                      cub::LOAD_DEFAULT,
-                      cub::BLOCK_STORE_WARP_TRANSPOSE>
-        type;
-  };
-  template <class T>
-  struct Tuning<sm35, T> : Tuning<sm30,T>
-  {
-    enum
-    {
-      NOMINAL_4B_ITEMS_PER_THREAD = 7,
-      ITEMS_PER_THREAD            = items_per_thread<Tuning::INPUT_SIZE,
-                                          NOMINAL_4B_ITEMS_PER_THREAD>::value
-    };
-    typedef PtxPolicy<128,
-                      ITEMS_PER_THREAD,
-                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
-                      cub::LOAD_LDG,
-                      cub::BLOCK_STORE_WARP_TRANSPOSE>
-        type;
-  };
-
-  template <class InputIt,
+  template <bool MayAlias,
+            class InputIt,
             class OutputIt,
-            class Size,
             class BinaryOp>
-  struct AdjacentDifferenceAgent
+  cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void *d_temp_storage,
+            size_t &temp_storage_bytes,
+            InputIt first,
+            OutputIt result,
+            BinaryOp binary_op,
+            std::size_t num_items,
+            cudaStream_t stream)
   {
-    typedef typename iterator_traits<InputIt>::value_type input_type;
-
-    // XXX output type must be result of BinaryOp(input_type,input_type);
-    typedef input_type output_type;
-
-    template<class Arch>
-    struct PtxPlan : Tuning<Arch,input_type>::type
-    {
-      typedef Tuning<Arch,input_type> tuning;
-
-      typedef typename core::LoadIterator<PtxPlan, InputIt>::type LoadIt;
-      typedef typename core::BlockLoad<PtxPlan, LoadIt>::type     BlockLoad;
-
-      typedef typename core::BlockStore<PtxPlan, OutputIt, input_type>::type
-          BlockStore;
-
-      typedef cub::BlockAdjacentDifference<input_type,
-                                           PtxPlan::BLOCK_THREADS,
-                                           1,
-                                           1,
-                                           Arch::ver>
-          BlockAdjacentDifference;
-
-      union TempStorage
-      {
-        typename BlockAdjacentDifference::TempStorage discontinuity;
-        typename BlockLoad::TempStorage                load;
-        typename BlockStore::TempStorage               store;
-      }; // union TempStorage
-    }; // struct PtxPlan
-
-    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
-
-    typedef typename ptx_plan::LoadIt      LoadIt;
-    typedef typename ptx_plan::BlockLoad   BlockLoad;
-    typedef typename ptx_plan::BlockStore  BlockStore;
-    typedef typename ptx_plan::BlockAdjacentDifference BlockAdjacentDifference;
-    typedef typename ptx_plan::TempStorage TempStorage;
-
-
-    enum
+    if (num_items == 0)
     {
-      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
-      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
-      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE,
-    };
+      return cudaSuccess;
+    }
 
-    struct impl
-    {
+    constexpr bool may_alias = MayAlias;
+    constexpr bool read_left = true;
+
+    using Dispatch32 = cub::DispatchAdjacentDifference<InputIt,
+                                                       OutputIt,
+                                                       BinaryOp,
+                                                       thrust::detail::int32_t,
+                                                       may_alias,
+                                                       read_left>;
+    using Dispatch64 = cub::DispatchAdjacentDifference<InputIt,
+                                                       OutputIt,
+                                                       BinaryOp,
+                                                       thrust::detail::int64_t,
+                                                       may_alias,
+                                                       read_left>;
 
-      //---------------------------------------------------------------------
-      // Per-thread fields
-      //---------------------------------------------------------------------
-
-      TempStorage &temp_storage;
-      LoadIt       load_it;                // iterator to the first element
-      input_type * first_tile_previous;    // iterator to the first element of previous tile value
-      OutputIt     output_it;
-      BinaryOp     binary_op;
-
-      template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
-      void THRUST_DEVICE_FUNCTION
-      consume_tile_impl(int  num_remaining,
-                        int  tile_idx,
-                        Size tile_base)
-      {
-        input_type  input[ITEMS_PER_THREAD];
-        input_type  input_prev[ITEMS_PER_THREAD];
-        output_type output[ITEMS_PER_THREAD];
-
-        if (IS_LAST_TILE)
-        {
-          // Fill last elements with the first element
-          // because collectives are not suffix guarded
-          BlockLoad(temp_storage.load)
-              .Load(load_it + tile_base,
-                    input,
-                    num_remaining,
-                    *(load_it + tile_base));
-        }
-        else
-        {
-          BlockLoad(temp_storage.load).Load(load_it + tile_base, input);
-        }
-
-
-        core::sync_threadblock();
-
-        if (IS_FIRST_TILE)
-        {
-          BlockAdjacentDifference(temp_storage.discontinuity)
-              .FlagHeads(output, input, input_prev, binary_op);
-          if (threadIdx.x == 0)
-            output[0] = input[0];
-        }
-        else
-        {
-          input_type tile_prev_input = first_tile_previous[tile_idx];
-          BlockAdjacentDifference(temp_storage.discontinuity)
-              .FlagHeads(output, input, input_prev, binary_op, tile_prev_input);
-        }
-
-        core::sync_threadblock();
-
-        if (IS_LAST_TILE)
-        {
-          BlockStore(temp_storage.store)
-              .Store(output_it + tile_base, output, num_remaining);
-        }
-        else
-        {
-          BlockStore(temp_storage.store).Store(output_it + tile_base, output);
-        }
-      }
-
-
-      template <bool IS_LAST_TILE>
-      void THRUST_DEVICE_FUNCTION
-      consume_tile(Size num_remaining,
-                   Size  tile_idx,
-                   Size tile_base)
-      {
-        if (tile_idx == 0)
-        {
-          consume_tile_impl<IS_LAST_TILE, true>(num_remaining,
-                                                tile_idx,
-                                                tile_base);
-        }
-        else
-        {
-          consume_tile_impl<IS_LAST_TILE, false>(num_remaining,
-                                                 tile_idx,
-                                                 tile_base);
-        }
-      }
-
-      void THRUST_DEVICE_FUNCTION
-      consume_range(Size num_items)
-      {
-        int  tile_idx      = blockIdx.x;
-        Size tile_base     = tile_idx * ITEMS_PER_TILE;
-        Size num_remaining = num_items - tile_base;
-
-        if (num_remaining > ITEMS_PER_TILE)    // not a last tile
-        {
-          consume_tile<false>(num_remaining, tile_idx, tile_base);
-        }
-        else if (num_remaining > 0)
-        {
-          consume_tile<true>(num_remaining, tile_idx, tile_base);
-        }
-      }
-
-      //---------------------------------------------------------------------
-      // Constructor
-      //---------------------------------------------------------------------
-
-      THRUST_DEVICE_FUNCTION
-      impl(TempStorage &temp_storage_,
-           InputIt      input_it_,
-           input_type * first_tile_previous_,
-           OutputIt     result_,
-           BinaryOp     binary_op_,
-           Size         num_items)
-          : temp_storage(temp_storage_),
-            load_it(core::make_load_iterator(ptx_plan(), input_it_)),
-            first_tile_previous(first_tile_previous_),
-            output_it(result_),
-            binary_op(binary_op_)
-      {
-        consume_range(num_items);
-      }
-    };    // struct impl
-
-    //---------------------------------------------------------------------
-    // Agent entry point
-    //---------------------------------------------------------------------
-
-    THRUST_AGENT_ENTRY(InputIt     first,
-                       input_type *first_element,
-                       OutputIt    result,
-                       BinaryOp    binary_op,
-                       Size        num_items,
-                       char *      shmem)
-    {
-      TempStorage &storage = *reinterpret_cast<TempStorage *>(shmem);
-      impl(storage, first, first_element, result, binary_op, num_items);
-    }
-  }; // struct AdjacentDifferenceAgent
+    cudaError_t status;
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                num_items,
+                                (d_temp_storage,
+                                 temp_storage_bytes,
+                                 first,
+                                 result,
+                                 num_items_fixed,
+                                 binary_op,
+                                 stream));
+    return status;
+  }
 
   template <class InputIt,
             class OutputIt,
-            class Size>
-  struct InitAgent
+            class BinaryOp>
+  cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void *d_temp_storage,
+            size_t &temp_storage_bytes,
+            InputIt first,
+            OutputIt result,
+            BinaryOp binary_op,
+            std::size_t num_items,
+            cudaStream_t stream,
+            thrust::detail::integral_constant<bool, false> /* comparable */)
   {
-    template <class Arch>
-    struct PtxPlan : PtxPolicy<128> {};
-    typedef core::specialize_plan<PtxPlan> ptx_plan;
-
-    //---------------------------------------------------------------------
-    // Agent entry point
-    //---------------------------------------------------------------------
-
-    THRUST_AGENT_ENTRY(InputIt  first,
-                       OutputIt result,
-                       Size     num_tiles,
-                       int      items_per_tile,
-                       char *   /*shmem*/)
-    {
-      int tile_idx  = blockIdx.x * blockDim.x + threadIdx.x;
-      int tile_base = tile_idx * items_per_tile;
-      if (tile_base > 0 && tile_idx < num_tiles)
-        result[tile_idx] = first[tile_base - 1];
-    }
-  }; // struct InitAgent
+    constexpr bool may_alias = true;
+    return doit_step<may_alias>(d_temp_storage,
+                                temp_storage_bytes,
+                                first,
+                                result,
+                                binary_op,
+                                num_items,
+                                stream);
+  }
 
   template <class InputIt,
             class OutputIt,
-            class BinaryOp,
-            class Size>
+            class BinaryOp>
   cudaError_t THRUST_RUNTIME_FUNCTION
-  doit_step(void *       d_temp_storage,
-            size_t &     temp_storage_bytes,
-            InputIt      first,
-            OutputIt     result,
-            BinaryOp     binary_op,
-            Size         num_items,
+  doit_step(void *d_temp_storage,
+            size_t &temp_storage_bytes,
+            InputIt first,
+            OutputIt result,
+            BinaryOp binary_op,
+            std::size_t num_items,
             cudaStream_t stream,
-            bool         debug_sync)
+            thrust::detail::integral_constant<bool, true> /* comparable */)
   {
-    if (num_items == 0)
-      return cudaSuccess;
-
-    using core::AgentPlan;
-    using core::AgentLauncher;
-
-    cudaError_t status = cudaSuccess;
-
-    typedef AgentLauncher<
-        AdjacentDifferenceAgent<InputIt,
-                                OutputIt,
-                                Size,
-                                BinaryOp> >
-        difference_agent;
-
-    typedef typename iterator_traits<InputIt>::value_type input_type;
-    typedef AgentLauncher<InitAgent<InputIt, input_type *, Size> > init_agent;
-
-    AgentPlan difference_plan = difference_agent::get_plan(stream);
-    AgentPlan init_plan       = init_agent::get_plan();
-
-
-    size_t tile_size = difference_plan.items_per_tile;
-    size_t num_tiles = (num_items + tile_size - 1) / tile_size;
-
-    size_t tmp1        = num_tiles * sizeof(input_type);
-    size_t vshmem_size = core::vshmem_size(difference_plan.shared_memory_size,
-                                           num_tiles);
-
-    size_t allocation_sizes[2] = {tmp1, vshmem_size};
-    void * allocations[2]      = {NULL, NULL};
-
-    status = core::alias_storage(d_temp_storage,
-                                 temp_storage_bytes,
-                                 allocations,
-                                 allocation_sizes);
-    CUDA_CUB_RET_IF_FAIL(status);
-
-    if (d_temp_storage == NULL)
+    // The documentation states that pointers might be equal but can't alias in
+    // any other way. That is, the distance should be equal to zero or exceed
+    // `num_items`. In the latter case, we use an optimized version.
+    if (first != result)
     {
-      return status;
+      constexpr bool may_alias = false;
+      return doit_step<may_alias>(d_temp_storage,
+                                  temp_storage_bytes,
+                                  first,
+                                  result,
+                                  binary_op,
+                                  num_items,
+                                  stream);
     }
 
-    input_type *first_tile_previous = (input_type *)allocations[0];
-    char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[1] : NULL;
-
-    init_agent ia(init_plan, num_tiles, stream, "adjacent_difference::init_agent", debug_sync);
-    ia.launch(first, first_tile_previous, num_tiles, tile_size);
-    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
-
-    difference_agent da(difference_plan, num_items, stream, vshmem_ptr, "adjacent_difference::difference_agent", debug_sync);
-    da.launch(first,
-              first_tile_previous,
-              result,
-              binary_op,
-              num_items);
-    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
-    return status;
+    constexpr bool may_alias = true;
+    return doit_step<may_alias>(d_temp_storage,
+                                temp_storage_bytes,
+                                first,
+                                result,
+                                binary_op,
+                                num_items,
+                                stream);
   }
 
   template <typename Derived,
@@ -442,40 +183,52 @@ namespace __adjacent_difference {
                       OutputIt                   result,
                       BinaryOp                   binary_op)
   {
-    typedef typename iterator_traits<InputIt>::difference_type size_type;
-
-    size_type    num_items    = thrust::distance(first, last);
-    size_t       storage_size = 0;
-    cudaStream_t stream       = cuda_cub::stream(policy);
-    bool         debug_sync   = THRUST_DEBUG_SYNC_FLAG;
-
-    cudaError_t status;
-    status = doit_step(NULL,
-                       storage_size,
-                       first,
-                       result,
-                       binary_op,
-                       num_items,
-                       stream,
-                       debug_sync);
+    const auto num_items =
+      static_cast<std::size_t>(thrust::distance(first, last));
+    std::size_t storage_size = 0;
+    cudaStream_t stream = cuda_cub::stream(policy);
+
+    using UnwrapInputIt = thrust::detail::try_unwrap_contiguous_iterator_return_t<InputIt>;
+    using UnwrapOutputIt = thrust::detail::try_unwrap_contiguous_iterator_return_t<OutputIt>;
+
+    using InputValueT = thrust::iterator_value_t<UnwrapInputIt>;
+    using OutputValueT = thrust::iterator_value_t<UnwrapOutputIt>;
+
+    constexpr bool can_compare_iterators =
+      std::is_pointer<UnwrapInputIt>::value &&
+      std::is_pointer<UnwrapOutputIt>::value &&
+      std::is_same<InputValueT, OutputValueT>::value;
+
+    auto first_unwrap = thrust::detail::try_unwrap_contiguous_iterator(first);
+    auto result_unwrap = thrust::detail::try_unwrap_contiguous_iterator(result);
+
+    thrust::detail::integral_constant<bool, can_compare_iterators> comparable;
+
+    cudaError_t status = doit_step(nullptr,
+                                   storage_size,
+                                   first_unwrap,
+                                   result_unwrap,
+                                   binary_op,
+                                   num_items,
+                                   stream,
+                                   comparable);
     cuda_cub::throw_on_error(status, "adjacent_difference failed on 1st step");
 
     // Allocate temporary storage.
     thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
       tmp(policy, storage_size);
-    void *ptr = static_cast<void*>(tmp.data().get());
 
-    status = doit_step(ptr,
+    status = doit_step(static_cast<void *>(tmp.data().get()),
                        storage_size,
-                       first,
-                       result,
+                       first_unwrap,
+                       result_unwrap,
                        binary_op,
                        num_items,
                        stream,
-                       debug_sync);
+                       comparable);
     cuda_cub::throw_on_error(status, "adjacent_difference failed on 2nd step");
 
-    status = cuda_cub::synchronize(policy);
+    status = cuda_cub::synchronize_optional(policy);
     cuda_cub::throw_on_error(status, "adjacent_difference failed to synchronize");
 
     return result + num_items;
@@ -499,28 +252,19 @@ adjacent_difference(execution_policy<Derived> &policy,
                     OutputIt                   result,
                     BinaryOp                   binary_op)
 {
-  OutputIt ret = result;
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __adjacent_difference::adjacent_difference(policy,
-        first,
-        last,
-        result,
-        binary_op);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::adjacent_difference(cvt_to_seq(derived_cast(policy)),
-                                      first,
-                                      last,
-                                      result,
-                                      binary_op);
-#endif
-  }
-
-  return ret;
-} 
+  THRUST_CDP_DISPATCH(
+    (result = __adjacent_difference::adjacent_difference(policy,
+                                                         first,
+                                                         last,
+                                                         result,
+                                                         binary_op);),
+    (result = thrust::adjacent_difference(cvt_to_seq(derived_cast(policy)),
+                                          first,
+                                          last,
+                                          result,
+                                          binary_op);));
+  return result;
+}
 
 template <class Derived,
           class InputIt,
@@ -541,7 +285,7 @@ adjacent_difference(execution_policy<Derived> &policy,
 
 
 } // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 //
 #include <thrust/memory.h>
diff --git a/thrust/system/cuda/detail/assign_value.h b/thrust/system/cuda/detail/assign_value.h
index 601700cb5..8945f1cac 100644
--- a/thrust/system/cuda/detail/assign_value.h
+++ b/thrust/system/cuda/detail/assign_value.h
@@ -16,15 +16,17 @@
 
 #pragma once
 
-#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <thrust/detail/config.h>
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <thrust/system/cuda/config.h>
 #include <thrust/system/cuda/detail/execution_policy.h>
 #include <thrust/detail/raw_pointer_cast.h>
 #include <thrust/system/cuda/detail/copy.h>
 
+#include <nv/target>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 
@@ -46,11 +48,12 @@ inline __host__ __device__
     }
   };
 
-#ifndef __CUDA_ARCH__
-  war_nvbugs_881631::host_path(exec,dst,src);
-#else
-  war_nvbugs_881631::device_path(exec,dst,src);
-#endif // __CUDA_ARCH__
+  NV_IF_TARGET(NV_IS_HOST, (
+    war_nvbugs_881631::host_path(exec,dst,src);
+  ), (
+    war_nvbugs_881631::device_path(exec,dst,src);
+  ));
+
 } // end assign_value()
 
 
@@ -78,16 +81,14 @@ inline __host__ __device__
     }
   };
 
-#if __CUDA_ARCH__
-  war_nvbugs_881631::device_path(systems,dst,src);
-#else
-  war_nvbugs_881631::host_path(systems,dst,src);
-#endif
+  NV_IF_TARGET(NV_IS_HOST, (
+    war_nvbugs_881631::host_path(systems,dst,src);
+  ), (
+    war_nvbugs_881631::device_path(systems,dst,src);
+  ));
 } // end assign_value()
 
 
-
-  
 } // end cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/async/copy.h b/thrust/system/cuda/detail/async/copy.h
index 8083fccd9..6f2970759 100644
--- a/thrust/system/cuda/detail/async/copy.h
+++ b/thrust/system/cuda/detail/async/copy.h
@@ -30,10 +30,9 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/cpp11_required.h>
-#include <thrust/detail/modern_gcc_required.h>
+#include <thrust/detail/cpp14_required.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 
@@ -54,7 +53,7 @@
 
 #include <type_traits>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace system { namespace cuda { namespace detail
 {
@@ -66,7 +65,6 @@ template <
   typename FromPolicy, typename ToPolicy
 , typename ForwardIt, typename OutputIt, typename Size
 >
-THRUST_RUNTIME_FUNCTION
 auto async_copy_n(
   FromPolicy& from_exec
 , ToPolicy&   to_exec
@@ -149,7 +147,6 @@ template <
   typename FromPolicy, typename ToPolicy
 , typename ForwardIt, typename OutputIt, typename Size
 >
-THRUST_RUNTIME_FUNCTION
 auto async_copy_n(
   thrust::cuda::execution_policy<FromPolicy>& from_exec
 , thrust::cuda::execution_policy<ToPolicy>&   to_exec
@@ -193,7 +190,6 @@ template <
   typename FromPolicy, typename ToPolicy
 , typename ForwardIt, typename OutputIt, typename Size
 >
-THRUST_RUNTIME_FUNCTION
 auto async_copy_n(
   FromPolicy& from_exec
 , ToPolicy&   to_exec
@@ -228,6 +224,10 @@ auto async_copy_n(
 template <
   typename FromPolicy, typename ToPolicy
 , typename ForwardIt, typename OutputIt
+  // MSVC2015 WAR: doesn't like decltype(...)::value in superclass definition
+, typename IsH2DCopy = decltype(is_host_to_device_copy(
+    std::declval<FromPolicy const&>()
+  , std::declval<ToPolicy const&>()))
 >
 struct is_buffered_trivially_relocatable_host_to_device_copy
   : thrust::integral_constant<
@@ -238,12 +238,7 @@ struct is_buffered_trivially_relocatable_host_to_device_copy
             typename iterator_traits<ForwardIt>::value_type
           , typename iterator_traits<OutputIt>::value_type
           >::value
-      && decltype(
-           is_host_to_device_copy(
-             std::declval<FromPolicy const&>()
-           , std::declval<ToPolicy const&>()
-           )
-         )::value
+      && IsH2DCopy::value
     >
 {};
 
@@ -254,7 +249,6 @@ template <
   typename FromPolicy, typename ToPolicy
 , typename ForwardIt, typename OutputIt, typename Size
 >
-THRUST_RUNTIME_FUNCTION
 auto async_copy_n(
   FromPolicy&                               from_exec
 , thrust::cuda::execution_policy<ToPolicy>& to_exec
@@ -333,6 +327,10 @@ auto async_copy_n(
 template <
   typename FromPolicy, typename ToPolicy
 , typename ForwardIt, typename OutputIt
+  // MSVC2015 WAR: doesn't like decltype(...)::value in superclass definition
+, typename IsD2HCopy = decltype(is_device_to_host_copy(
+    std::declval<FromPolicy const&>()
+  , std::declval<ToPolicy const&>()))
 >
 struct is_buffered_trivially_relocatable_device_to_host_copy
   : thrust::integral_constant<
@@ -343,12 +341,7 @@ struct is_buffered_trivially_relocatable_device_to_host_copy
             typename iterator_traits<ForwardIt>::value_type
           , typename iterator_traits<OutputIt>::value_type
           >::value
-      && decltype(
-           is_device_to_host_copy(
-             std::declval<FromPolicy const&>()
-           , std::declval<ToPolicy const&>()
-           )
-         )::value
+      && IsD2HCopy::value
     >
 {};
 
@@ -359,7 +352,6 @@ template <
   typename FromPolicy, typename ToPolicy
 , typename ForwardIt, typename OutputIt, typename Size
 >
-THRUST_RUNTIME_FUNCTION
 auto async_copy_n(
   thrust::cuda::execution_policy<FromPolicy>& from_exec
 , ToPolicy&                                   to_exec
@@ -441,7 +433,6 @@ template <
   typename FromPolicy, typename ToPolicy
 , typename ForwardIt, typename OutputIt, typename Size
 >
-THRUST_RUNTIME_FUNCTION
 auto async_copy_n(
   FromPolicy& from_exec
 , ToPolicy&   to_exec
@@ -487,7 +478,6 @@ template <
   typename FromPolicy, typename ToPolicy
 , typename ForwardIt, typename Sentinel, typename OutputIt
 >
-THRUST_RUNTIME_FUNCTION
 auto async_copy(
   thrust::cuda::execution_policy<FromPolicy>&         from_exec
 , thrust::cpp::execution_policy<ToPolicy>&            to_exec
@@ -495,7 +485,7 @@ auto async_copy(
 , Sentinel                                            last
 , OutputIt                                            output
 )
-THRUST_DECLTYPE_RETURNS(
+THRUST_RETURNS(
   thrust::system::cuda::detail::async_copy_n(
     from_exec, to_exec, first, distance(first, last), output
   )
@@ -506,7 +496,6 @@ template <
   typename FromPolicy, typename ToPolicy
 , typename ForwardIt, typename Sentinel, typename OutputIt
 >
-THRUST_RUNTIME_FUNCTION
 auto async_copy(
   thrust::cpp::execution_policy<FromPolicy>& from_exec
 , thrust::cuda::execution_policy<ToPolicy>&  to_exec
@@ -514,7 +503,7 @@ auto async_copy(
 , Sentinel                                   last
 , OutputIt                                   output
 )
-THRUST_DECLTYPE_RETURNS(
+THRUST_RETURNS(
   thrust::system::cuda::detail::async_copy_n(
     from_exec, to_exec, first, distance(first, last), output
   )
@@ -525,7 +514,6 @@ template <
   typename FromPolicy, typename ToPolicy
 , typename ForwardIt, typename Sentinel, typename OutputIt
 >
-THRUST_RUNTIME_FUNCTION
 auto async_copy(
   thrust::cuda::execution_policy<FromPolicy>& from_exec
 , thrust::cuda::execution_policy<ToPolicy>&   to_exec
@@ -533,7 +521,7 @@ auto async_copy(
 , Sentinel                                    last
 , OutputIt                                    output
 )
-THRUST_DECLTYPE_RETURNS(
+THRUST_RETURNS(
   thrust::system::cuda::detail::async_copy_n(
     from_exec, to_exec, first, distance(first, last), output
   )
@@ -541,7 +529,7 @@ THRUST_DECLTYPE_RETURNS(
 
 } // cuda_cub
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 
diff --git a/thrust/system/cuda/detail/async/customization.h b/thrust/system/cuda/detail/async/customization.h
index 651eb287f..6f125a6f4 100644
--- a/thrust/system/cuda/detail/async/customization.h
+++ b/thrust/system/cuda/detail/async/customization.h
@@ -30,10 +30,9 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/cpp11_required.h>
-#include <thrust/detail/modern_gcc_required.h>
+#include <thrust/detail/cpp14_required.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 
@@ -43,13 +42,13 @@
 #include <thrust/detail/cstdint.h>
 #include <thrust/detail/execute_with_allocator.h>
 #include <thrust/system/cuda/memory_resource.h>
-#include <thrust/memory/detail/host_system_resource.h>
+#include <thrust/mr/host_memory_resource.h>
 #include <thrust/mr/allocator.h>
 #include <thrust/mr/disjoint_sync_pool.h>
 #include <thrust/mr/sync_pool.h>
 #include <thrust/per_device_resource.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace system { namespace cuda { namespace detail
 {
@@ -63,7 +62,7 @@ template <typename DerivedPolicy>
 auto get_async_host_allocator(
   thrust::detail::execution_policy_base<DerivedPolicy>&
 )
-THRUST_DECLTYPE_RETURNS(
+THRUST_RETURNS(
   thrust::mr::stateless_resource_allocator<
     thrust::detail::uint8_t, default_async_host_resource
   >{}
@@ -81,7 +80,7 @@ template <typename DerivedPolicy>
 auto get_async_device_allocator(
   thrust::detail::execution_policy_base<DerivedPolicy>&
 )
-THRUST_DECLTYPE_RETURNS(
+THRUST_RETURNS(
   thrust::per_device_allocator<
     thrust::detail::uint8_t, default_async_device_resource, par_t
   >{}
@@ -91,7 +90,7 @@ template <typename Allocator, template <typename> class BaseSystem>
 auto get_async_device_allocator(
   thrust::detail::execute_with_allocator<Allocator, BaseSystem>& exec
 )
-THRUST_DECLTYPE_RETURNS(exec.get_allocator())
+THRUST_RETURNS(exec.get_allocator())
 
 template <typename Allocator, template <typename> class BaseSystem>
 auto get_async_device_allocator(
@@ -99,7 +98,7 @@ auto get_async_device_allocator(
     Allocator, BaseSystem
   >& exec
 )
-THRUST_DECLTYPE_RETURNS(exec.get_allocator())
+THRUST_RETURNS(exec.get_allocator())
 
 ///////////////////////////////////////////////////////////////////////////////
 
@@ -112,7 +111,7 @@ template <typename DerivedPolicy>
 auto get_async_universal_host_pinned_allocator(
   thrust::detail::execution_policy_base<DerivedPolicy>&
 )
-THRUST_DECLTYPE_RETURNS(
+THRUST_RETURNS(
   thrust::mr::stateless_resource_allocator<
     thrust::detail::uint8_t, default_async_universal_host_pinned_resource
   >{}
@@ -120,7 +119,7 @@ THRUST_DECLTYPE_RETURNS(
 
 }}} // namespace system::cuda::detail
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 
diff --git a/thrust/system/cuda/detail/async/exclusive_scan.h b/thrust/system/cuda/detail/async/exclusive_scan.h
new file mode 100644
index 000000000..0b120a434
--- /dev/null
+++ b/thrust/system/cuda/detail/async/exclusive_scan.h
@@ -0,0 +1,201 @@
+/******************************************************************************
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/iterator/iterator_traits.h>
+
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/async/customization.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/future.h>
+
+#include <thrust/type_traits/remove_cvref.h>
+
+#include <thrust/distance.h>
+
+#include <type_traits>
+
+// TODO specialize for thrust::plus to use e.g. ExclusiveSum instead of ExcScan
+//  - Note that thrust::plus<> is transparent, cub::Sum is not. This should be
+//    fixed in CUB first).
+//  - Need to check if CUB actually optimizes for sums before putting in effort
+
+THRUST_NAMESPACE_BEGIN
+namespace system
+{
+namespace cuda
+{
+namespace detail
+{
+
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Size,
+          typename OutputIt,
+          typename InitialValueType,
+          typename BinaryOp>
+unique_eager_event
+async_exclusive_scan_n(execution_policy<DerivedPolicy>& policy,
+                       ForwardIt first,
+                       Size n,
+                       OutputIt out,
+                       InitialValueType init,
+                       BinaryOp op)
+{
+  using InputValueT = cub::detail::InputValue<InitialValueType>;
+  using Dispatch32 = cub::DispatchScan<ForwardIt,
+                                       OutputIt,
+                                       BinaryOp,
+                                       InputValueT,
+                                       thrust::detail::int32_t,
+                                       InitialValueType>;
+  using Dispatch64 = cub::DispatchScan<ForwardIt,
+                                       OutputIt,
+                                       BinaryOp,
+                                       InputValueT,
+                                       thrust::detail::int64_t,
+                                       InitialValueType>;
+
+  InputValueT init_value(init);
+
+  auto const device_alloc = get_async_device_allocator(policy);
+  unique_eager_event ev;
+
+  // Determine temporary device storage requirements.
+  cudaError_t status;
+  size_t tmp_size = 0;
+  {
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                n,
+                                (nullptr,
+                                  tmp_size,
+                                  first,
+                                  out,
+                                  op,
+                                  init_value,
+                                  n_fixed,
+                                  nullptr));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after determining tmp storage "
+                                     "requirements for exclusive_scan");
+  }
+
+  // Allocate temporary storage.
+  auto content = uninitialized_allocate_unique_n<thrust::detail::uint8_t>(
+    device_alloc, tmp_size
+  );
+  void* const tmp_ptr = raw_pointer_cast(content.get());
+
+  // Set up stream with dependencies.
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy);
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    ev = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          std::move(content),
+          unique_stream(nonowning, user_raw_stream)
+        ),
+        extract_dependencies(std::move(thrust::detail::derived_cast(policy)))));
+  }
+  else
+  {
+    ev = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(std::move(content)),
+        extract_dependencies(std::move(thrust::detail::derived_cast(policy)))));
+  }
+
+  // Run scan.
+  {
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                n,
+                                (tmp_ptr,
+                                  tmp_size,
+                                  first,
+                                  out,
+                                  op,
+                                  init_value,
+                                  n_fixed,
+                                  user_raw_stream));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after dispatching exclusive_scan kernel");
+  }
+
+  return ev;
+}
+
+}}} // namespace system::cuda::detail
+
+namespace cuda_cub
+{
+
+// ADL entry point.
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Sentinel,
+          typename OutputIt,
+          typename InitialValueType,
+          typename BinaryOp>
+auto async_exclusive_scan(execution_policy<DerivedPolicy>& policy,
+                          ForwardIt first,
+                          Sentinel&& last,
+                          OutputIt&& out,
+                          InitialValueType &&init,
+                          BinaryOp&& op)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_exclusive_scan_n(
+    policy,
+    first,
+    distance(first, THRUST_FWD(last)),
+    THRUST_FWD(out),
+    THRUST_FWD(init),
+    THRUST_FWD(op)
+  )
+)
+
+} // namespace cuda_cub
+
+THRUST_NAMESPACE_END
+
+#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#endif // C++14
+
diff --git a/thrust/system/cuda/detail/async/for_each.h b/thrust/system/cuda/detail/async/for_each.h
index a6faf178f..d6809fe0a 100644
--- a/thrust/system/cuda/detail/async/for_each.h
+++ b/thrust/system/cuda/detail/async/for_each.h
@@ -31,10 +31,9 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/cpp11_required.h>
-#include <thrust/detail/modern_gcc_required.h>
+#include <thrust/detail/cpp14_required.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 
@@ -48,7 +47,7 @@
 
 #include <type_traits>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace system { namespace cuda { namespace detail
 {
@@ -76,14 +75,12 @@ template <
   typename DerivedPolicy
 , typename ForwardIt, typename Size, typename UnaryFunction
 >
-THRUST_RUNTIME_FUNCTION
-auto async_for_each_n(
+unique_eager_event async_for_each_n(
   execution_policy<DerivedPolicy>& policy,
   ForwardIt                        first,
   Size                             n,
   UnaryFunction                    func
-) -> unique_eager_event
-{
+) {
   unique_eager_event e;
 
   // Set up stream with dependencies.
@@ -138,14 +135,13 @@ template <
   typename DerivedPolicy
 , typename ForwardIt, typename Sentinel, typename UnaryFunction
 >
-THRUST_RUNTIME_FUNCTION
 auto async_for_each(
   execution_policy<DerivedPolicy>& policy,
   ForwardIt                        first,
   Sentinel                         last,
   UnaryFunction&&                  func
 )
-THRUST_DECLTYPE_RETURNS(
+THRUST_RETURNS(
   thrust::system::cuda::detail::async_for_each_n(
     policy, first, distance(first, last), THRUST_FWD(func)
   )
@@ -153,7 +149,7 @@ THRUST_DECLTYPE_RETURNS(
 
 } // cuda_cub
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 
diff --git a/thrust/system/cuda/detail/async/inclusive_scan.h b/thrust/system/cuda/detail/async/inclusive_scan.h
new file mode 100644
index 000000000..363347c35
--- /dev/null
+++ b/thrust/system/cuda/detail/async/inclusive_scan.h
@@ -0,0 +1,194 @@
+/******************************************************************************
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/iterator/iterator_traits.h>
+
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/async/customization.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/future.h>
+
+#include <thrust/type_traits/remove_cvref.h>
+
+#include <thrust/distance.h>
+
+#include <type_traits>
+
+// TODO specialize for thrust::plus to use e.g. InclusiveSum instead of IncScan
+//  - Note that thrust::plus<> is transparent, cub::Sum is not. This should be
+//    fixed in CUB first).
+//  - Need to check if CUB actually optimizes for sums before putting in effort
+
+THRUST_NAMESPACE_BEGIN
+namespace system
+{
+namespace cuda
+{
+namespace detail
+{
+
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Size,
+          typename OutputIt,
+          typename BinaryOp>
+unique_eager_event
+async_inclusive_scan_n(execution_policy<DerivedPolicy>& policy,
+                       ForwardIt first,
+                       Size n,
+                       OutputIt out,
+                       BinaryOp op)
+{
+  using AccumT = typename thrust::iterator_traits<ForwardIt>::value_type;
+  using Dispatch32 = cub::DispatchScan<ForwardIt,
+                                       OutputIt,
+                                       BinaryOp,
+                                       cub::NullType,
+                                       thrust::detail::int32_t,
+                                       AccumT>;
+  using Dispatch64 = cub::DispatchScan<ForwardIt,
+                                       OutputIt,
+                                       BinaryOp,
+                                       cub::NullType,
+                                       thrust::detail::int64_t,
+                                       AccumT>;
+
+  auto const device_alloc = get_async_device_allocator(policy);
+  unique_eager_event ev;
+
+  // Determine temporary device storage requirements.
+  cudaError_t status;
+  size_t tmp_size = 0;
+  {
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                n,
+                                (nullptr,
+                                  tmp_size,
+                                  first,
+                                  out,
+                                  op,
+                                  cub::NullType{},
+                                  n_fixed,
+                                  nullptr));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after determining tmp storage "
+                                     "requirements for inclusive_scan");
+  }
+
+  // Allocate temporary storage.
+  auto content = uninitialized_allocate_unique_n<thrust::detail::uint8_t>(
+    device_alloc, tmp_size
+  );
+  void* const tmp_ptr = raw_pointer_cast(content.get());
+
+  // Set up stream with dependencies.
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy);
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    ev = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          std::move(content),
+          unique_stream(nonowning, user_raw_stream)
+        ),
+        extract_dependencies(std::move(thrust::detail::derived_cast(policy)))));
+  }
+  else
+  {
+    ev = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(std::move(content)),
+        extract_dependencies(std::move(thrust::detail::derived_cast(policy)))));
+  }
+
+  // Run scan.
+  {
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                n,
+                                (tmp_ptr,
+                                 tmp_size,
+                                 first,
+                                 out,
+                                 op,
+                                 cub::NullType{},
+                                 n_fixed,
+                                 user_raw_stream));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after dispatching inclusive_scan kernel");
+  }
+
+  return ev;
+}
+
+}}} // namespace system::cuda::detail
+
+namespace cuda_cub
+{
+
+// ADL entry point.
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Sentinel,
+          typename OutputIt,
+          typename BinaryOp>
+auto async_inclusive_scan(execution_policy<DerivedPolicy>& policy,
+                          ForwardIt first,
+                          Sentinel&& last,
+                          OutputIt&& out,
+                          BinaryOp&& op)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_inclusive_scan_n(
+    policy,
+    first,
+    distance(first, THRUST_FWD(last)),
+    THRUST_FWD(out),
+    THRUST_FWD(op)
+  )
+)
+
+} // namespace cuda_cub
+
+THRUST_NAMESPACE_END
+
+#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#endif // C++14
+
diff --git a/thrust/system/cuda/detail/async/reduce.h b/thrust/system/cuda/detail/async/reduce.h
index f2e000abc..2d0dbfe16 100644
--- a/thrust/system/cuda/detail/async/reduce.h
+++ b/thrust/system/cuda/detail/async/reduce.h
@@ -32,10 +32,9 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/cpp11_required.h>
-#include <thrust/detail/modern_gcc_required.h>
+#include <thrust/detail/cpp14_required.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 
@@ -50,7 +49,7 @@
 
 #include <type_traits>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace system { namespace cuda { namespace detail
 {
@@ -59,22 +58,20 @@ template <
   typename DerivedPolicy
 , typename ForwardIt, typename Size, typename T, typename BinaryOp
 >
-THRUST_RUNTIME_FUNCTION
-auto async_reduce_n(
+unique_eager_future<remove_cvref_t<T>> async_reduce_n(
   execution_policy<DerivedPolicy>& policy
 , ForwardIt                        first
 , Size                             n
 , T                                init
 , BinaryOp                         op
-) -> unique_eager_future<remove_cvref_t<T>>
-{
+) {
   using U = remove_cvref_t<T>;
 
   auto const device_alloc = get_async_device_allocator(policy);
 
   using pointer
     = typename thrust::detail::allocator_traits<decltype(device_alloc)>::
-      rebind_traits<U>::pointer;
+      template rebind_traits<U>::pointer;
 
   unique_eager_future_promise_pair<U, pointer> fp;
 
@@ -82,7 +79,7 @@ auto async_reduce_n(
 
   size_t tmp_size = 0;
   thrust::cuda_cub::throw_on_error(
-    thrust::cuda_cub::cub::DeviceReduce::Reduce(
+    cub::DeviceReduce::Reduce(
       nullptr
     , tmp_size
     , first
@@ -91,7 +88,6 @@ auto async_reduce_n(
     , op
     , init
     , nullptr // Null stream, just for sizing.
-    , THRUST_DEBUG_SYNC_FLAG
     )
   , "after reduction sizing"
   );
@@ -164,7 +160,7 @@ auto async_reduce_n(
   // Run reduction.
 
   thrust::cuda_cub::throw_on_error(
-    thrust::cuda_cub::cub::DeviceReduce::Reduce(
+    cub::DeviceReduce::Reduce(
       tmp_ptr
     , tmp_size
     , first
@@ -173,7 +169,6 @@ auto async_reduce_n(
     , op
     , init
     , fp.future.stream().native_handle()
-    , THRUST_DEBUG_SYNC_FLAG
     )
   , "after reduction launch"
   );
@@ -191,7 +186,6 @@ template <
   typename DerivedPolicy
 , typename ForwardIt, typename Sentinel, typename T, typename BinaryOp
 >
-THRUST_RUNTIME_FUNCTION
 auto async_reduce(
   execution_policy<DerivedPolicy>& policy
 , ForwardIt                        first
@@ -199,7 +193,7 @@ auto async_reduce(
 , T                                init
 , BinaryOp                         op
 )
-THRUST_DECLTYPE_RETURNS(
+THRUST_RETURNS(
   thrust::system::cuda::detail::async_reduce_n(
     policy, first, distance(first, last), init, op
   )
@@ -217,16 +211,14 @@ template <
 , typename ForwardIt, typename Size, typename OutputIt
 , typename T, typename BinaryOp
 >
-THRUST_RUNTIME_FUNCTION
-auto async_reduce_into_n(
+unique_eager_event async_reduce_into_n(
   execution_policy<DerivedPolicy>& policy
 , ForwardIt                        first
 , Size                             n
 , OutputIt                         output
 , T                                init
 , BinaryOp                         op
-) -> unique_eager_event
-{
+) {
   using U = remove_cvref_t<T>;
 
   auto const device_alloc = get_async_device_allocator(policy);
@@ -237,7 +229,7 @@ auto async_reduce_into_n(
 
   size_t tmp_size = 0;
   thrust::cuda_cub::throw_on_error(
-    thrust::cuda_cub::cub::DeviceReduce::Reduce(
+    cub::DeviceReduce::Reduce(
       nullptr
     , tmp_size
     , first
@@ -246,7 +238,6 @@ auto async_reduce_into_n(
     , op
     , init
     , nullptr // Null stream, just for sizing.
-    , THRUST_DEBUG_SYNC_FLAG
     )
   , "after reduction sizing"
   );
@@ -301,7 +292,7 @@ auto async_reduce_into_n(
   // Run reduction.
 
   thrust::cuda_cub::throw_on_error(
-    thrust::cuda_cub::cub::DeviceReduce::Reduce(
+    cub::DeviceReduce::Reduce(
       tmp_ptr
     , tmp_size
     , first
@@ -310,7 +301,6 @@ auto async_reduce_into_n(
     , op
     , init
     , e.stream().native_handle()
-    , THRUST_DEBUG_SYNC_FLAG
     )
   , "after reduction launch"
   );
@@ -329,7 +319,6 @@ template <
 , typename ForwardIt, typename Sentinel, typename OutputIt
 , typename T, typename BinaryOp
 >
-THRUST_RUNTIME_FUNCTION
 auto async_reduce_into(
   execution_policy<DerivedPolicy>& policy
 , ForwardIt                        first
@@ -338,7 +327,7 @@ auto async_reduce_into(
 , T                                init
 , BinaryOp                         op
 )
-THRUST_DECLTYPE_RETURNS(
+THRUST_RETURNS(
   thrust::system::cuda::detail::async_reduce_into_n(
     policy, first, distance(first, last), output, init, op
   )
@@ -346,9 +335,9 @@ THRUST_DECLTYPE_RETURNS(
 
 } // cuda_cub
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 
-#endif 
+#endif
 
diff --git a/thrust/system/cuda/detail/cub/util_namespace.cuh b/thrust/system/cuda/detail/async/scan.h
similarity index 67%
rename from thrust/system/cuda/detail/cub/util_namespace.cuh
rename to thrust/system/cuda/detail/async/scan.h
index 0c2bf29fe..4a9f31681 100644
--- a/thrust/system/cuda/detail/cub/util_namespace.cuh
+++ b/thrust/system/cuda/detail/async/scan.h
@@ -1,6 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -13,10 +12,10 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
@@ -26,21 +25,9 @@
  *
  ******************************************************************************/
 
-/**
- * \file
- * Place-holder for prefixing the cub namespace
- */
-
 #pragma once
 
-// For example:
-//#define THRUST_CUB_NS_PREFIX namespace thrust{ namespace detail {
-//#define THRUST_CUB_NS_POSTFIX } }
-
-#ifndef THRUST_CUB_NS_PREFIX
-#define THRUST_CUB_NS_PREFIX
-#endif
+#include <thrust/detail/cpp14_required.h>
 
-#ifndef THRUST_CUB_NS_POSTFIX
-#define THRUST_CUB_NS_POSTFIX
-#endif
+#include <thrust/system/cuda/detail/async/exclusive_scan.h>
+#include <thrust/system/cuda/detail/async/inclusive_scan.h>
diff --git a/thrust/system/cuda/detail/async/sort.h b/thrust/system/cuda/detail/async/sort.h
index 10ca12d7c..f501f19c5 100644
--- a/thrust/system/cuda/detail/async/sort.h
+++ b/thrust/system/cuda/detail/async/sort.h
@@ -30,10 +30,9 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/cpp11_required.h>
-#include <thrust/detail/modern_gcc_required.h>
+#include <thrust/detail/cpp14_required.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 
@@ -54,7 +53,7 @@
 
 #include <type_traits>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace system { namespace cuda { namespace detail
 {
@@ -64,7 +63,6 @@ template <
   typename DerivedPolicy
 , typename ForwardIt, typename Size, typename StrictWeakOrdering
 >
-THRUST_RUNTIME_FUNCTION
 auto async_stable_sort_n(
   execution_policy<DerivedPolicy>& policy,
   ForwardIt                        first,
@@ -87,7 +85,7 @@ auto async_stable_sort_n(
 
   auto const device_buffer_ptr = device_buffer.get();
 
-  // Synthesize a suitable new execution policy, because we don't want to 
+  // Synthesize a suitable new execution policy, because we don't want to
   // try and extract twice from the one we were passed.
   typename remove_cvref_t<decltype(policy)>::tag_type tag_policy{};
 
@@ -172,7 +170,6 @@ template <
   typename DerivedPolicy
 , typename ForwardIt, typename Size, typename StrictWeakOrdering
 >
-THRUST_RUNTIME_FUNCTION
 auto async_stable_sort_n(
   execution_policy<DerivedPolicy>& policy,
   ForwardIt                        first,
@@ -214,7 +211,6 @@ auto async_stable_sort_n(
     , n
     , comp
     , nullptr // Null stream, just for sizing.
-    , THRUST_DEBUG_SYNC_FLAG
     )
   , "after merge sort sizing"
   );
@@ -279,7 +275,6 @@ auto async_stable_sort_n(
     , n
     , comp
     , e.stream().native_handle()
-    , THRUST_DEBUG_SYNC_FLAG
     )
   , "after merge sort sizing"
   );
@@ -288,21 +283,20 @@ auto async_stable_sort_n(
 }
 
 template <typename T, typename Size, typename StrictWeakOrdering>
-THRUST_RUNTIME_FUNCTION
 typename std::enable_if<
   is_operator_less_function_object<StrictWeakOrdering>::value
 , cudaError_t
 >::type
 invoke_radix_sort(
-  cudaStream_t                            stream
-, void*                                   tmp_ptr
-, std::size_t&                            tmp_size
-, thrust::cuda_cub::cub::DoubleBuffer<T>& keys
-, Size&                                   n
+  cudaStream_t          stream
+, void*                 tmp_ptr
+, std::size_t&          tmp_size
+, cub::DoubleBuffer<T>& keys
+, Size&                 n
 , StrictWeakOrdering
 )
 {
-  return thrust::cuda_cub::cub::DeviceRadixSort::SortKeys(
+  return cub::DeviceRadixSort::SortKeys(
     tmp_ptr
   , tmp_size
   , keys
@@ -310,26 +304,24 @@ invoke_radix_sort(
   , 0
   , sizeof(T) * 8
   , stream
-  , THRUST_DEBUG_SYNC_FLAG
   );
 }
 
 template <typename T, typename Size, typename StrictWeakOrdering>
-THRUST_RUNTIME_FUNCTION
 typename std::enable_if<
   is_operator_greater_function_object<StrictWeakOrdering>::value
 , cudaError_t
 >::type
 invoke_radix_sort(
-  cudaStream_t                            stream
-, void*                                   tmp_ptr
-, std::size_t&                            tmp_size
-, thrust::cuda_cub::cub::DoubleBuffer<T>& keys
-, Size&                                   n
+  cudaStream_t          stream
+, void*                 tmp_ptr
+, std::size_t&          tmp_size
+, cub::DoubleBuffer<T>& keys
+, Size&                 n
 , StrictWeakOrdering
 )
 {
-  return thrust::cuda_cub::cub::DeviceRadixSort::SortKeysDescending(
+  return cub::DeviceRadixSort::SortKeysDescending(
     tmp_ptr
   , tmp_size
   , keys
@@ -337,7 +329,6 @@ invoke_radix_sort(
   , 0
   , sizeof(T) * 8
   , stream
-  , THRUST_DEBUG_SYNC_FLAG
   );
 }
 
@@ -348,7 +339,6 @@ template <
   typename DerivedPolicy
 , typename ForwardIt, typename Size, typename StrictWeakOrdering
 >
-THRUST_RUNTIME_FUNCTION
 auto async_stable_sort_n(
   execution_policy<DerivedPolicy>& policy
 , ForwardIt                        first
@@ -372,7 +362,7 @@ auto async_stable_sort_n(
 
   unique_eager_event e;
 
-  thrust::cuda_cub::cub::DoubleBuffer<T> keys(
+  cub::DoubleBuffer<T> keys(
     raw_pointer_cast(&*first), nullptr
   );
 
@@ -476,7 +466,7 @@ auto async_stable_sort_n(
       )>::value
     ));
 
-    // Synthesize a suitable new execution policy, because we don't want to 
+    // Synthesize a suitable new execution policy, because we don't want to
     // try and extract twice from the one we were passed.
     typename remove_cvref_t<decltype(policy)>::tag_type tag_policy{};
 
@@ -503,13 +493,14 @@ template <
   typename DerivedPolicy
 , typename ForwardIt, typename Sentinel, typename StrictWeakOrdering
 >
-THRUST_RUNTIME_FUNCTION
 auto async_stable_sort(
   execution_policy<DerivedPolicy>& policy,
   ForwardIt                        first,
   Sentinel                         last,
   StrictWeakOrdering               comp
 )
+// A GCC 5 bug requires an explicit trailing return type here, so stick with
+// THRUST_DECLTYPE_RETURNS for now.
 THRUST_DECLTYPE_RETURNS(
   thrust::system::cuda::detail::async_stable_sort_n(
     policy, first, distance(first, last), comp
@@ -518,7 +509,7 @@ THRUST_DECLTYPE_RETURNS(
 
 } // cuda_cub
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 
diff --git a/thrust/system/cuda/detail/async/transform.h b/thrust/system/cuda/detail/async/transform.h
index 55cc1997b..a971300f2 100644
--- a/thrust/system/cuda/detail/async/transform.h
+++ b/thrust/system/cuda/detail/async/transform.h
@@ -30,10 +30,9 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/cpp11_required.h>
-#include <thrust/detail/modern_gcc_required.h>
+#include <thrust/detail/cpp14_required.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 
@@ -48,7 +47,7 @@
 
 #include <type_traits>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace system { namespace cuda { namespace detail
 {
@@ -77,15 +76,13 @@ template <
   typename DerivedPolicy
 , typename ForwardIt, typename Size, typename OutputIt, typename UnaryOperation
 >
-THRUST_RUNTIME_FUNCTION
-auto async_transform_n(
+unique_eager_event async_transform_n(
   execution_policy<DerivedPolicy>& policy,
   ForwardIt                        first,
   Size                             n,
   OutputIt                         output,
   UnaryOperation                   op
-) -> unique_eager_event
-{
+) {
   unique_eager_event e;
 
   // Set up stream with dependencies.
@@ -141,7 +138,6 @@ template <
 , typename ForwardIt, typename Sentinel, typename OutputIt
 , typename UnaryOperation
 >
-THRUST_RUNTIME_FUNCTION
 auto async_transform(
   execution_policy<DerivedPolicy>& policy,
   ForwardIt                        first,
@@ -149,7 +145,7 @@ auto async_transform(
   OutputIt                         output,
   UnaryOperation&&                 op
 )
-THRUST_DECLTYPE_RETURNS(
+THRUST_RETURNS(
   thrust::system::cuda::detail::async_transform_n(
     policy, first, distance(first, last), output, THRUST_FWD(op)
   )
@@ -157,7 +153,7 @@ THRUST_DECLTYPE_RETURNS(
 
 } // cuda_cub
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 
diff --git a/thrust/system/cuda/detail/binary_search.h b/thrust/system/cuda/detail/binary_search.h
index d42ac1a0f..fb769a4ac 100644
--- a/thrust/system/cuda/detail/binary_search.h
+++ b/thrust/system/cuda/detail/binary_search.h
@@ -1,784 +1,19 @@
-/******************************************************************************
- * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-#pragma once
-
-#if 0
-
-#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-#include <thrust/detail/cstdint.h>
-#include <thrust/detail/temporary_array.h>
-#include <thrust/system/cuda/detail/util.h>
-
-#include <thrust/system/cuda/execution_policy.h>
-#include <thrust/system/cuda/detail/core/agent_launcher.h>
-#include <thrust/system/cuda/detail/core/util.h>
-#include <thrust/system/cuda/detail/par_to_seq.h>
-#include <thrust/binary_search.h>
-#include <thrust/distance.h>
-
-#if 1
-#  define BS_SIMPLE
-#endif
-
-THRUST_BEGIN_NS
-namespace cuda_cub {
-
-namespace __binary_search {
-
-  template <class HaystackIt, class NeedlesIt>
-  struct lbf
-  {
-    typedef typename iterator_traits<HaystackIt>::difference_type result_type;
-    typedef typename iterator_traits<NeedlesIt>::value_type T;
-
-    template <class It, class CompareOp>
-    THRUST_DEVICE_FUNCTION result_type
-    operator()(It begin, It end, T const& value, CompareOp comp)
-    {
-      return system::detail::generic::scalar::lower_bound(begin,
-                                                          end,
-                                                          value,
-                                                          comp) -
-             begin;
-    }
-  };    // struct lbf
-
-  template<class HaystackIt, class NeedlesIt>
-  struct ubf
-  {
-    typedef typename iterator_traits<HaystackIt>::difference_type result_type;
-    typedef typename iterator_traits<NeedlesIt>::value_type T;
-
-    template <class It, class CompareOp>
-    THRUST_DEVICE_FUNCTION result_type
-    operator()(It begin, It end, T const& value, CompareOp comp)
-    {
-      return system::detail::generic::scalar::upper_bound(begin,
-                                                          end,
-                                                          value,
-                                                          comp) -
-             begin;
-    }
-  };    // struct ubf
-
-  template<class HaystackIt, class NeedlesIt>
-  struct bsf
-  {
-    typedef bool result_type;
-    typedef typename iterator_traits<NeedlesIt>::value_type T;
-
-    template <class It, class CompareOp>
-    THRUST_DEVICE_FUNCTION bool 
-    operator()(It begin, It end, T const& value, CompareOp comp)
-    {
-      HaystackIt iter = system::detail::generic::scalar::lower_bound(begin,
-                                                                     end,
-                                                                     value,
-                                                                     comp);
-
-      detail::wrapped_function<CompareOp, bool> wrapped_comp(comp);
-
-      return iter != end && !wrapped_comp(value, *iter);
-    }
-  };    // struct bsf
-
-  template <class KeysIt1,
-            class KeysIt2,
-            class Size,
-            class BinaryPred>
-  THRUST_DEVICE_FUNCTION Size 
-  merge_path(KeysIt1    keys1,
-             KeysIt2    keys2,
-             Size       keys1_count,
-             Size       keys2_count,
-             Size       diag,
-             BinaryPred binary_pred)
-  {
-    typedef typename iterator_traits<KeysIt1>::value_type key1_type;
-    typedef typename iterator_traits<KeysIt2>::value_type key2_type;
-
-    Size keys1_begin = thrust::max<Size>(0, diag - keys2_count);
-    Size keys1_end   = thrust::min<Size>(diag, keys1_count);
-
-    while (keys1_begin < keys1_end)
-    {
-      Size      mid  = (keys1_begin + keys1_end) >> 1;
-      key1_type key1 = keys1[mid];
-      key2_type key2 = keys2[diag - 1 - mid];
-      bool      pred = binary_pred(key2, key1);
-      if (pred)
-      {
-        keys1_end = mid;
-      }
-      else
-      {
-        keys1_begin = mid + 1;
-      }
-    }
-    return keys1_begin;
-  }
-
-  template <class It, class T2, class CompareOp, int ITEMS_PER_THREAD>
-  THRUST_DEVICE_FUNCTION void 
-  serial_merge(It  keys_shared,
-               int keys1_beg,
-               int keys2_beg,
-               int keys1_count,
-               int keys2_count,
-               T2 (&output)[ITEMS_PER_THREAD],
-               int (&indices)[ITEMS_PER_THREAD],
-               CompareOp compare_op)
-  {
-    int keys1_end = keys1_beg + keys1_count;
-    int keys2_end = keys2_beg + keys2_count;
-    
-    typedef typename iterator_value<It>::type key_type;
-
-    key_type key1 = keys_shared[keys1_beg];
-    key_type key2 = keys_shared[keys2_beg];
-
-
-#pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-    {
-      bool p = (keys2_beg < keys2_end) &&
-               ((keys1_beg >= keys1_end) ||
-                compare_op(key2,key1));
-
-      output[ITEM]  = p ? key2 : key1;
-      indices[ITEM] = p ? keys2_beg++ : keys1_beg++;
-
-      if (p)
-      {
-        key2 = keys_shared[keys2_beg];
-      }
-      else
-      {
-        key1 = keys_shared[keys1_beg];
-      }
-    }
-  }
-
-  template <int                      _BLOCK_THREADS,
-            int                      _ITEMS_PER_THREAD = 1,
-            int                      _MIN_BLOCKS       = 1,
-            cub::BlockLoadAlgorithm  _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
-            cub::CacheLoadModifier   _LOAD_MODIFIER    = cub::LOAD_LDG,
-            cub::BlockStoreAlgorithm _STORE_ALGORITHM  = cub::BLOCK_STORE_DIRECT>
-  struct PtxPolicy
-  {
-    enum
-    {
-      BLOCK_THREADS      = _BLOCK_THREADS,
-      ITEMS_PER_THREAD   = _ITEMS_PER_THREAD,
-      MIN_BLOCKS         = _MIN_BLOCKS,
-      ITEMS_PER_TILE     = _BLOCK_THREADS * _ITEMS_PER_THREAD,
-    };
-
-    static const cub::BlockLoadAlgorithm  LOAD_ALGORITHM  = _LOAD_ALGORITHM;
-    static const cub::CacheLoadModifier   LOAD_MODIFIER   = _LOAD_MODIFIER;
-    static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
-  };    // PtxPolicy
-  
-  template <class Arch, class T>
-  struct Tuning;
-
-  template<class T>  
-  struct Tuning<sm30,T>
-  {
-    enum
-    {
-      NOMINAL_4B_ITEMS_PER_THREAD = 7,
-      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-    };
-
-    typedef PtxPolicy<128,
-                      ITEMS_PER_THREAD,
-                      1,
-                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
-                      cub::LOAD_LDG,
-                      cub::BLOCK_STORE_TRANSPOSE>
-        type;
-  };
-  
-  template<class T>
-  struct Tuning<sm52,T>
-  {
-    const static int INPUT_SIZE = sizeof(T);
-
-    enum
-    {
-      NOMINAL_4B_ITEMS_PER_THREAD = 7,
-      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-    };
-
-    typedef PtxPolicy<128,
-                      ITEMS_PER_THREAD,
-                      1,
-                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
-                      cub::LOAD_LDG,
-                      cub::BLOCK_STORE_WARP_TRANSPOSE>
-        type;
-  };
-  
-  template <class NeedlesIt,
-            class HaystackIt,
-            class Size,
-            class OutputIt,
-            class CompareOp,
-            class SearchOp>
-  struct VectorizedBinarySearchAgent
-  {
-    typedef typename iterator_traits<NeedlesIt>::value_type  needle_type;
-    typedef typename iterator_traits<HaystackIt>::value_type haystack_type;
-    typedef typename SearchOp::result_type                   result_type;
-
-    template <class Arch>
-    struct PtxPlan : Tuning<Arch, needle_type>::type
-    {
-      typedef Tuning<Arch,needle_type> tuning;
-
-      typedef typename core::LoadIterator<PtxPlan, NeedlesIt>::type  NeedlesLoadIt;
-      typedef typename core::LoadIterator<PtxPlan, HaystackIt>::type HaystackLoadIt;
-
-      typedef typename core::BlockLoad<PtxPlan, NeedlesLoadIt>::type BlockLoadNeedles;
-
-      typedef typename core::BlockStore<PtxPlan, OutputIt, result_type>::type BlockStoreResult;
-
-      union TempStorage
-      {
-        typename BlockLoadNeedles::TempStorage load_needles;
-        typename BlockStoreResult::TempStorage store_result;
-
-#ifndef BS_SIMPLE
-        core::uninitialized_array<needle_type, PtxPlan::ITEMS_PER_TILE + 1> needles_shared;
-        core::uninitialized_array<result_type, PtxPlan::ITEMS_PER_TILE>     result_shared;
-        core::uninitialized_array<int, PtxPlan::ITEMS_PER_TILE>             indices_shared;
-#endif
-      };    // union TempStorage
-    };
-
-    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
-
-    typedef typename ptx_plan::NeedlesLoadIt    NeedlesLoadIt;
-    typedef typename ptx_plan::HaystackLoadIt   HaystackLoadIt;
-    typedef typename ptx_plan::BlockLoadNeedles BlockLoadNeedles;
-    typedef typename ptx_plan::BlockStoreResult BlockStoreResult;
-    typedef typename ptx_plan::TempStorage     TempStorage;
-
-    enum
-    {
-      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
-      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
-      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE
-    };
-
-    struct impl
-    {
-      TempStorage&   storage;
-      NeedlesLoadIt  needles_load_it;
-      HaystackLoadIt haystack_load_it;
-      Size           needles_count;
-      Size           haystack_size;
-      OutputIt       result;
-      CompareOp      compare_op;
-      SearchOp       search_op;
-
-      THRUST_DEVICE_FUNCTION
-      void stable_odd_even_sort(needle_type (&needles)[ITEMS_PER_THREAD],
-                                int (&indices)[ITEMS_PER_THREAD])
-      {
-#pragma unroll
-        for (int I = 0; I < ITEMS_PER_THREAD; ++I)
-        {
-#pragma unroll
-          for (int J = 1 & I; J < ITEMS_PER_THREAD - 1; J += 2)
-          {
-            if (compare_op(needles[J + 1], needles[J]))
-            {
-              using thrust::swap;
-              swap(needles[J], needles[J + 1]);
-              swap(indices[J], indices[J + 1]);
-            }
-          }    // inner loop
-        }      // outer loop
-      }
-
-      THRUST_DEVICE_FUNCTION void
-      block_mergesort(int tid,
-                      int count,
-                      needle_type (&needles_loc)[ITEMS_PER_THREAD],
-                      int (&indices_loc)[ITEMS_PER_THREAD])
-      {
-        using core::sync_threadblock;
-
-        // stable sort items in a single thread
-        //
-        stable_odd_even_sort(needles_loc,indices_loc);
-
-        // each thread has  sorted keys_loc
-        // merge sort keys_loc in shared memory
-        //
-#pragma unroll
-        for (int coop = 2; coop <= BLOCK_THREADS; coop *= 2)
-        {
-          sync_threadblock();
-
-          // store keys in shmem
-          //
-#pragma unroll
-          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-          {
-            int idx = ITEMS_PER_THREAD * threadIdx.x + ITEM;
-            storage.needles_shared[idx] = needles_loc[ITEM];
-          }
-
-          sync_threadblock();
-
-          int  indices[ITEMS_PER_THREAD];
-
-          int list  = ~(coop - 1) & tid;
-          int start = ITEMS_PER_THREAD * list;
-          int size  = ITEMS_PER_THREAD * (coop >> 1);
-
-          int diag = min(count, ITEMS_PER_THREAD * ((coop - 1) & tid));
-
-          int keys1_beg = min(count, start);
-          int keys1_end = min(count, keys1_beg + size);
-          int keys2_beg = keys1_end;
-          int keys2_end = min(count, keys2_beg + size);
+/*
+*  Copyright 2021 NVIDIA Corporation
+*
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*/
 
-          int keys1_count = keys1_end - keys1_beg;
-          int keys2_count = keys2_end - keys2_beg;
-
-          int partition_diag = merge_path(&storage.needles_shared[keys1_beg],
-                                          &storage.needles_shared[keys2_beg],
-                                          keys1_count,
-                                          keys2_count,
-                                          diag,
-                                          compare_op);
-
-          int keys1_beg_loc   = keys1_beg + partition_diag;
-          int keys1_end_loc   = keys1_end;
-          int keys2_beg_loc   = keys2_beg + diag - partition_diag;
-          int keys2_end_loc   = keys2_end;
-          int keys1_count_loc = keys1_end_loc - keys1_beg_loc;
-          int keys2_count_loc = keys2_end_loc - keys2_beg_loc;
-          serial_merge(&storage.needles_shared[0],
-                       keys1_beg_loc,
-                       keys2_beg_loc,
-                       keys1_count_loc,
-                       keys2_count_loc,
-                       needles_loc,
-                       indices,
-                       compare_op);
-
-
-          sync_threadblock();
-
-#pragma unroll
-          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-          {
-            int idx = ITEMS_PER_THREAD * threadIdx.x + ITEM;
-            storage.indices_shared[idx] = indices_loc[ITEM];
-          }
-
-          sync_threadblock();
-
-#pragma unroll
-          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-          {
-            indices_loc[ITEM] = storage.indices_shared[indices[ITEM]];
-          }
-        }
-      }    // func block_merge_sort
-
-      template <bool IS_LAST_TILE>
-      THRUST_DEVICE_FUNCTION void
-      consume_tile(int  tid,
-                   Size tile_idx,
-                   Size tile_base,
-                   int  num_remaining)
-      {
-        using core::sync_threadblock;
-
-        needle_type needles_loc[ITEMS_PER_THREAD];
-        BlockLoadNeedles(storage.load_needles)
-            .Load(needles_load_it + tile_base, needles_loc, num_remaining);
-       
-#ifdef BS_SIMPLE
-
-        result_type results_loc[ITEMS_PER_THREAD];
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-          results_loc[ITEM] = search_op(haystack_load_it,
-                                        haystack_load_it + haystack_size,
-                                        needles_loc[ITEM],
-                                        compare_op);
-        }
-
-
-#else
-
-        if (IS_LAST_TILE)
-        {
-          needle_type max_value = needles_loc[0];
-#pragma unroll
-          for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
-          {
-            if (ITEMS_PER_THREAD * tid + ITEM < num_remaining)
-            {
-              max_value = compare_op(max_value, needles_loc[ITEM])
-                            ? needles_loc[ITEM]
-                            : max_value;
-            }
-            else
-            {
-              needles_loc[ITEM] = max_value;
-            }
-          }
-        }
-
-        sync_threadblock();
-
-        int indices_loc[ITEMS_PER_THREAD];
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-          int idx = ITEMS_PER_THREAD*threadIdx.x + ITEM;
-          indices_loc[ITEM] = idx;
-        }
-
-        if (IS_LAST_TILE)
-        {
-          block_mergesort(tid,
-                          num_remaining,
-                          needles_loc,
-                          indices_loc);
-        }
-        else
-        {
-          block_mergesort(tid,
-                          ITEMS_PER_TILE,
-                          needles_loc,
-                          indices_loc);
-        }
-
-        sync_threadblock();
-
-#pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-          int idx = indices_loc[ITEM];
-          storage.result_shared[idx] =
-              search_op(haystack_load_it,
-                        haystack_load_it + haystack_size,
-                        needles_loc[ITEM],
-                        compare_op);
-        }
-        
-        sync_threadblock();
-
-        result_type results_loc[ITEMS_PER_THREAD];
-#pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-          int idx = ITEMS_PER_THREAD*threadIdx.x + ITEM;
-          results_loc[ITEM] = storage.result_shared[idx];
-        }
-
-        sync_threadblock();
-#endif
-
-        BlockStoreResult(storage.store_result)
-            .Store(result + tile_base, results_loc, num_remaining);
-      }
-
-      THRUST_DEVICE_FUNCTION
-      impl(TempStorage& storage_,
-           NeedlesIt    needles_it_,
-           HaystackIt   haystack_it_,
-           Size         needles_count_,
-           Size         haystack_size_,
-           OutputIt     result_,
-           CompareOp    compare_op_,
-           SearchOp     search_op_)
-          : storage(storage_),
-            needles_load_it(core::make_load_iterator(ptx_plan(), needles_it_)),
-            haystack_load_it(core::make_load_iterator(ptx_plan(), haystack_it_)),
-            needles_count(needles_count_),
-            haystack_size(haystack_size_),
-            result(result_),
-            compare_op(compare_op_),
-            search_op(search_op_)
-      {
-        int  tid           = threadIdx.x;
-        Size tile_idx      = blockIdx.x;
-        Size num_tiles     = gridDim.x;
-        Size tile_base     = tile_idx * ITEMS_PER_TILE;
-        int  items_in_tile = min<int>(needles_count - tile_base, ITEMS_PER_TILE);
-        if (tile_idx < num_tiles - 1)
-        {
-          consume_tile<false>(tid, tile_idx, tile_base, ITEMS_PER_TILE);
-        }
-        else
-        {
-          consume_tile<true>(tid, tile_idx, tile_base, items_in_tile);
-        }
-      }
-    };    // struct impl
-
-
-    THRUST_AGENT_ENTRY(NeedlesIt  needles_it,
-                       HaystackIt haystack_it,
-                       Size       needles_count,
-                       Size       haystack_size,
-                       OutputIt   result,
-                       CompareOp  compare_op,
-                       SearchOp   search_op,
-                       char*      shmem)
-    {
-      TempStorage& storage = *reinterpret_cast<TempStorage*>(shmem);
-
-      impl(storage,
-           needles_it,
-           haystack_it,
-           needles_count,
-           haystack_size,
-           result,
-           compare_op,
-           search_op);
-    }
-  };    // struct VectorizedBinarySearchAgent
-
-  template <class NeedlesIt,
-            class HaystackIt,
-            class Size,
-            class OutputIt,
-            class CompareOp,
-            class SearchOp>
-  cudaError_t THRUST_RUNTIME_FUNCTION
-  doit_pass(void*        d_temp_storage,
-            size_t&      temp_storage_size,
-            NeedlesIt    needles_it,
-            HaystackIt   haystack_it,
-            Size         needles_count,
-            Size         haystack_size,
-            OutputIt     result,
-            CompareOp    compare_op,
-            SearchOp     search_op,
-            cudaStream_t stream,
-            bool         debug_sync)
-  {
-    if (needles_count == 0)
-      return cudaErrorNotSupported;
-
-    cudaError_t status = cudaSuccess;
-
-    using core::AgentPlan;
-    using core::AgentLauncher;
-
-
-    typedef AgentLauncher<
-        VectorizedBinarySearchAgent<NeedlesIt,
-                                    HaystackIt,
-                                    Size,
-                                    OutputIt,
-                                    CompareOp,
-                                    SearchOp> >
-        search_agent;
-
-    AgentPlan search_plan = search_agent::get_plan(stream);
-
-    temp_storage_size = 1;
-    if (d_temp_storage == NULL)
-    {
-      return status;
-    }
-
-    search_agent sa(search_plan, needles_count, stream, "binary_search::search_agent", debug_sync);
-    sa.launch(needles_it,
-              haystack_it,
-              needles_count,
-              haystack_size,
-              result,
-              compare_op,
-              search_op);
-    
-    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
-
-    return status;
-  }
-
-  template <typename Derived,
-            typename NeedlesIt,
-            typename HaystackIt,
-            typename OutputIt,
-            typename CompareOp,
-            typename SearchOp>
-  OutputIt THRUST_RUNTIME_FUNCTION
-  doit(execution_policy<Derived>& policy,
-       HaystackIt                 haystack_begin,
-       HaystackIt                 haystack_end,
-       NeedlesIt                  needles_begin,
-       NeedlesIt                  needles_end,
-       OutputIt                   result,
-       CompareOp                  compare_op,
-       SearchOp                   search_op)
-  {
-    typedef typename iterator_traits<NeedlesIt>::difference_type size_type;
-
-    size_type needles_count = thrust::distance(needles_begin, needles_end);
-    size_type haystack_size = thrust::distance(haystack_begin, haystack_end);
-
-    if (needles_count == 0)
-      return result;
-
-    size_t       storage_size = 0;
-    cudaStream_t stream       = cuda_cub::stream(policy);
-    bool         debug_sync   = THRUST_DEBUG_SYNC_FLAG;
-
-    cudaError status;
-    status = doit_pass(NULL,
-                       storage_size,
-                       needles_begin,
-                       haystack_begin,
-                       needles_count,
-                       haystack_size,
-                       result,
-                       compare_op,
-                       search_op,
-                       stream,
-                       debug_sync);
-    cuda_cub::throw_on_error(status, "binary_search: failed on 1st call");
-
-    // Allocate temporary storage.
-    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
-      tmp(policy, storage_size);
-    void *ptr = static_cast<void*>(tmp.data().get());
-
-    status = doit_pass(ptr,
-                       storage_size,
-                       needles_begin,
-                       haystack_begin,
-                       needles_count,
-                       haystack_size,
-                       result,
-                       compare_op,
-                       search_op,
-                       stream,
-                       debug_sync);
-    cuda_cub::throw_on_error(status, "binary_search: failed on 2nt call");
-    
-    status = cuda_cub::synchronize(policy);
-    cuda_cub::throw_on_error(status, "binary_search: failed to synchronize");
-
-    return result + needles_count;
-  }
-
-  struct less
-  {
-    template <typename T1, typename T2>
-    THRUST_DEVICE_FUNCTION bool
-    operator()(const T1& lhs, const T2& rhs) const
-    {
-      return lhs < rhs;
-    }
-  };
-}    // namespace __binary_search
-
-//-------------------------
-// Thrust API entry points
-//-------------------------
-
-__thrust_exec_check_disable__
-template <class Derived,
-          class HaystackIt,
-          class NeedlesIt,
-          class OutputIt,
-          class CompareOp>
-OutputIt __host__ __device__
-lower_bound(execution_policy<Derived>& policy,
-            HaystackIt                 first,
-            HaystackIt                 last,
-            NeedlesIt                  values_first,
-            NeedlesIt                  values_last,
-            OutputIt                   result,
-            CompareOp                  compare_op)
-{
-  OutputIt ret = result;
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __binary_search::doit(policy,
-                                first,
-                                last,
-                                values_first,
-                                values_last,
-                                result,
-                                compare_op,
-                                __binary_search::lbf<HaystackIt, NeedlesIt>());
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::lower_bound(cvt_to_seq(derived_cast(policy)),
-                              first,
-                              last,
-                              values_first,
-                              values_last,
-                              result);
-#endif
-  }
-  return ret;
-}
-
-
-template <class Derived,
-          class HaystackIt,
-          class NeedlesIt,
-          class OutputIt>
-OutputIt __host__ __device__
-lower_bound(execution_policy<Derived>& policy,
-            HaystackIt                 first,
-            HaystackIt                 last,
-            NeedlesIt                  values_first,
-            NeedlesIt                  values_last,
-            OutputIt                   result)
-{
-  return cuda_cub::lower_bound(policy,
-                               first,
-                               last,
-                               values_first,
-                               values_last,
-                               result,
-                               __binary_search::less());
-}
-
-}    // namespace cuda_cub
-THRUST_END_NS
-#endif
+#pragma once
 
-#endif
+// this system has no special version of this algorithm
diff --git a/thrust/system/cuda/detail/cdp_dispatch.h b/thrust/system/cuda/detail/cdp_dispatch.h
new file mode 100644
index 000000000..f94e5dd92
--- /dev/null
+++ b/thrust/system/cuda/detail/cdp_dispatch.h
@@ -0,0 +1,88 @@
+/*
+*  Copyright 2021-2022 NVIDIA Corporation
+*
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*/
+
+/**
+ * \file
+ * Utilities for CUDA dynamic parallelism.
+ */
+
+#pragma once
+
+#include <cub/config.cuh>
+#include <cub/detail/detect_cuda_runtime.cuh>
+
+#include <nv/target>
+
+/**
+ * \def THRUST_CDP_DISPATCH
+ *
+ * If CUDA Dynamic Parallelism / CUDA Nested Parallelism is available, always
+ * run the parallel implementation. Otherwise, run the parallel implementation
+ * when called from the host, and fallback to the sequential implementation on
+ * the device.
+ *
+ * `par_impl` and `seq_impl` are blocks of C++ statements enclosed in
+ * parentheses, similar to NV_IF_TARGET blocks:
+ *
+ * \code
+ * THRUST_CDP_DISPATCH((launch_parallel_kernel();), (run_serial_impl();));
+ * \endcode
+ */
+
+#if defined(CUB_DETAIL_CDPv1)
+
+// Special case for NVCC -- need to inform the device path about the kernels
+// that are launched from the host path.
+#if defined(__CUDACC__) && defined(__CUDA_ARCH__)
+
+// seq_impl only used on platforms that do not support device synchronization.
+#define THRUST_CDP_DISPATCH(par_impl, seq_impl)                                \
+  if (false)                                                                   \
+  { /* Without this, the device pass won't compile any kernels. */             \
+    NV_IF_TARGET(NV_ANY_TARGET, par_impl);                                     \
+  }                                                                            \
+  NV_IF_TARGET(NV_PROVIDES_SM_90, seq_impl, par_impl)
+
+#else // NVCC device pass
+
+// seq_impl only used on platforms that do not support device synchronization.
+#define THRUST_CDP_DISPATCH(par_impl, seq_impl)                                \
+  NV_IF_TARGET(NV_PROVIDES_SM_90, seq_impl, par_impl)
+
+#endif // NVCC device pass
+
+#else // CDPv1 unavailable. Always fallback to serial on device:
+
+// Special case for NVCC -- need to inform the device path about the kernels
+// that are launched from the host path.
+#if defined(__CUDACC__) && defined(__CUDA_ARCH__)
+
+// Device-side launch not supported, fallback to sequential in device code.
+#define THRUST_CDP_DISPATCH(par_impl, seq_impl)                                \
+  if (false)                                                                   \
+  { /* Without this, the device pass won't compile any kernels. */             \
+    NV_IF_TARGET(NV_ANY_TARGET, par_impl);                                     \
+  }                                                                            \
+  NV_IF_TARGET(NV_IS_HOST, par_impl, seq_impl)
+
+#else // !(NVCC device pass):
+
+#define THRUST_CDP_DISPATCH(par_impl, seq_impl)                                \
+  NV_IF_TARGET(NV_IS_HOST, par_impl, seq_impl)
+
+#endif // NVCC device pass
+
+#endif // CDP version
diff --git a/thrust/system/cuda/detail/copy.h b/thrust/system/cuda/detail/copy.h
index 15dd00b41..02a5d2ac1 100644
--- a/thrust/system/cuda/detail/copy.h
+++ b/thrust/system/cuda/detail/copy.h
@@ -26,12 +26,16 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
+
+#include <thrust/advance.h>
 
 #include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
 #include <thrust/system/cuda/detail/execution_policy.h>
 #include <thrust/system/cuda/detail/cross_system.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 template <typename DerivedPolicy, typename InputIt, typename OutputIt>
 __host__ __device__ OutputIt
@@ -91,7 +95,7 @@ copy_n(cross_system<System1, System2> systems,
        OutputIterator result);
 
 }    // namespace cuda_
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 
 
@@ -99,7 +103,7 @@ THRUST_END_NS
 #include <thrust/system/cuda/detail/internal/copy_cross_system.h>
 #include <thrust/system/cuda/detail/par_to_seq.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 
@@ -116,22 +120,11 @@ copy(execution_policy<System> &system,
      InputIterator             last,
      OutputIterator            result)
 {
-  OutputIterator ret = result;
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __copy::device_to_device(system, first, last, result);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::copy(cvt_to_seq(derived_cast(system)),
-                       first,
-                       last,
-                       result);
-#endif
-  }
-
-  return ret;
+  THRUST_CDP_DISPATCH(
+    (result = __copy::device_to_device(system, first, last, result);),
+    (result =
+       thrust::copy(cvt_to_seq(derived_cast(system)), first, last, result);));
+  return result;
 }    // end copy()
 
 __thrust_exec_check_disable__
@@ -145,19 +138,14 @@ copy_n(execution_policy<System> &system,
        Size                      n,
        OutputIterator            result)
 {
-  OutputIterator ret = result;
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __copy::device_to_device(system, first, first + n, result);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::copy_n(cvt_to_seq(derived_cast(system)), first, n, result);
-#endif
-  }
-
-  return ret;
+  THRUST_CDP_DISPATCH(
+    (result = __copy::device_to_device(system,
+                                       first,
+                                       thrust::next(first, n),
+                                       result);),
+    (result =
+       thrust::copy_n(cvt_to_seq(derived_cast(system)), first, n, result);));
+  return result;
 } // end copy_n()
 #endif
 
@@ -190,7 +178,7 @@ copy_n(cross_system<System1, System2> systems,
 
 
 }    // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #include <thrust/memory.h>
 #include <thrust/detail/temporary_array.h>
diff --git a/thrust/system/cuda/detail/copy_if.h b/thrust/system/cuda/detail/copy_if.h
index 2ee870225..5e760c086 100644
--- a/thrust/system/cuda/detail/copy_if.h
+++ b/thrust/system/cuda/detail/copy_if.h
@@ -26,22 +26,26 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-#include <thrust/system/cuda/config.h>
 
+#include <thrust/detail/alignment.h>
 #include <thrust/detail/cstdint.h>
+#include <thrust/detail/function.h>
 #include <thrust/detail/temporary_array.h>
-#include <thrust/system/cuda/detail/util.h>
-#include <thrust/system/cuda/detail/cub/device/device_select.cuh>
+#include <thrust/distance.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
 #include <thrust/system/cuda/detail/core/agent_launcher.h>
 #include <thrust/system/cuda/detail/core/util.h>
 #include <thrust/system/cuda/detail/par_to_seq.h>
-#include <thrust/detail/function.h>
-#include <thrust/distance.h>
-#include <thrust/detail/alignment.h>
+#include <thrust/system/cuda/detail/util.h>
 
-THRUST_BEGIN_NS
+#include <cub/device/device_select.cuh>
+#include <cub/util_math.cuh>
+
+THRUST_NAMESPACE_BEGIN
 // XXX declare generic copy_if interface
 // to avoid circulular dependency from thrust/copy.h
 template <typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename Predicate>
@@ -69,7 +73,6 @@ namespace __copy_if {
 
   template <int                     _BLOCK_THREADS,
             int                     _ITEMS_PER_THREAD = 1,
-            int                     _MIN_BLOCKS       = 1,
             cub::BlockLoadAlgorithm _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
             cub::CacheLoadModifier  _LOAD_MODIFIER    = cub::LOAD_LDG,
             cub::BlockScanAlgorithm _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS>
@@ -79,7 +82,6 @@ namespace __copy_if {
     {
       BLOCK_THREADS      = _BLOCK_THREADS,
       ITEMS_PER_THREAD   = _ITEMS_PER_THREAD,
-      MIN_BLOCKS         = _MIN_BLOCKS,
       ITEMS_PER_TILE     = _BLOCK_THREADS * _ITEMS_PER_THREAD,
     };
     static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
@@ -89,7 +91,7 @@ namespace __copy_if {
 
   template<class, class>
   struct Tuning;
-  
+
   template<class T>
   struct Tuning<sm52, T>
   {
@@ -103,13 +105,12 @@ namespace __copy_if {
 
     typedef PtxPolicy<128,
                       ITEMS_PER_THREAD,
-                      1,
                       cub::BLOCK_LOAD_WARP_TRANSPOSE,
                       cub::LOAD_LDG,
                       cub::BLOCK_SCAN_WARP_SCANS>
         type;
   };    // Tuning<350>
-  
+
 
   template<class T>
   struct Tuning<sm35, T>
@@ -124,13 +125,12 @@ namespace __copy_if {
 
     typedef PtxPolicy<128,
                       ITEMS_PER_THREAD,
-                      1,
                       cub::BLOCK_LOAD_WARP_TRANSPOSE,
                       cub::LOAD_LDG,
                       cub::BLOCK_SCAN_WARP_SCANS>
         type;
   };    // Tuning<350>
-  
+
   template<class T>
   struct Tuning<sm30, T>
   {
@@ -144,13 +144,12 @@ namespace __copy_if {
 
     typedef PtxPolicy<128,
                       ITEMS_PER_THREAD,
-                      1,
                       cub::BLOCK_LOAD_WARP_TRANSPOSE,
                       cub::LOAD_DEFAULT,
                       cub::BLOCK_SCAN_WARP_SCANS>
         type;
   };    // Tuning<300>
-  
+
   struct no_stencil_tag_    {};
   typedef no_stencil_tag_* no_stencil_tag;
   template <class ItemsIt,
@@ -194,11 +193,11 @@ namespace __copy_if {
 
       union TempStorage
       {
-        struct
+        struct ScanStorage
         {
           typename BlockScan::TempStorage          scan;
           typename TilePrefixCallback::TempStorage prefix;
-        };
+        } scan_storage;
 
         typename BlockLoadItems::TempStorage   load_items;
         typename BlockLoadStencil::TempStorage load_stencil;
@@ -206,7 +205,7 @@ namespace __copy_if {
         core::uninitialized_array<item_type, PtxPlan::ITEMS_PER_TILE> raw_exchange;
       };    // union TempStorage
     };    // struct PtxPlan
-    
+
     typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
 
     typedef typename ptx_plan::ItemsLoadIt        ItemsLoadIt;
@@ -224,7 +223,7 @@ namespace __copy_if {
       ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
       ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE
     };
-    
+
     struct impl
     {
       //---------------------------------------------------------------------
@@ -238,7 +237,7 @@ namespace __copy_if {
       OutputIt       output_it;
       Predicate      predicate;
       Size           num_items;
-      
+
       //------------------------------------------
       // scatter results to memory
       //------------------------------------------
@@ -259,7 +258,7 @@ namespace __copy_if {
                                      num_selections_prefix;
           if (selection_flags[ITEM])
           {
-            storage.raw_exchange[local_scatter_offset] = items[ITEM];
+            new (&storage.raw_exchange[local_scatter_offset]) item_type(items[ITEM]);
           }
         }
 
@@ -272,7 +271,7 @@ namespace __copy_if {
           output_it[num_selections_prefix + item] = storage.raw_exchange[item];
         }
       }    // func scatter
-      
+
       //------------------------------------------
       // specialize predicate on different types
       //------------------------------------------
@@ -357,11 +356,11 @@ namespace __copy_if {
           }
         }
       }
-      
+
       //------------------------------------------
       // consume tiles
       //------------------------------------------
-      
+
       template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
       Size THRUST_DEVICE_FUNCTION
       consume_tile_impl(int  num_tile_items,
@@ -423,7 +422,7 @@ namespace __copy_if {
         Size num_selections_prefix = 0;
         if (IS_FIRST_TILE)
         {
-          BlockScan(storage.scan)
+          BlockScan(storage.scan_storage.scan)
               .ExclusiveSum(selection_flags,
                             selection_idx,
                             num_tile_selections);
@@ -446,10 +445,10 @@ namespace __copy_if {
         else
         {
           TilePrefixCallback prefix_cb(tile_state,
-                                       storage.prefix,
+                                       storage.scan_storage.prefix,
                                        cub::Sum(),
                                        tile_idx);
-          BlockScan(storage.scan)
+          BlockScan(storage.scan_storage.scan)
               .ExclusiveSum(selection_flags,
                             selection_idx,
                             prefix_cb);
@@ -501,7 +500,7 @@ namespace __copy_if {
       //---------------------------------------------------------------------
       // Constructor
       //---------------------------------------------------------------------
-      
+
       THRUST_DEVICE_FUNCTION impl(TempStorage &       storage_,
                                   ScanTileState &     tile_state_,
                                   ItemsIt             items_it,
@@ -578,7 +577,7 @@ namespace __copy_if {
     template <class Arch>
     struct PtxPlan : PtxPolicy<128> {};
     typedef core::specialize_plan<PtxPlan> ptx_plan;
-    
+
     //---------------------------------------------------------------------
     // Agent entry point
     //---------------------------------------------------------------------
@@ -600,17 +599,16 @@ namespace __copy_if {
             class Predicate,
             class Size,
             class NumSelectedOutIt>
-  static cudaError_t THRUST_RUNTIME_FUNCTION
-  doit_step(void *           d_temp_storage,
-            size_t &         temp_storage_bytes,
-            ItemsIt          items,
-            StencilIt        stencil,
-            OutputIt         output_it,
-            Predicate        predicate,
-            NumSelectedOutIt num_selected_out,
-            Size             num_items,
-            cudaStream_t     stream,
-            bool             debug_sync)
+  THRUST_RUNTIME_FUNCTION
+  static cudaError_t doit_step(void *           d_temp_storage,
+                               size_t &         temp_storage_bytes,
+                               ItemsIt          items,
+                               StencilIt        stencil,
+                               OutputIt         output_it,
+                               Predicate        predicate,
+                               NumSelectedOutIt num_selected_out,
+                               Size             num_items,
+                               cudaStream_t     stream)
   {
     if (num_items == 0)
       return cudaSuccess;
@@ -640,7 +638,7 @@ namespace __copy_if {
     typename get_plan<copy_if_agent>::type copy_if_plan = copy_if_agent::get_plan(stream);
 
     int tile_size = copy_if_plan.items_per_tile;
-    size_t num_tiles = (num_items + tile_size - 1) / tile_size;
+    size_t num_tiles = cub::DivideAndRoundUp(num_items, tile_size);
 
     size_t vshmem_size = core::vshmem_size(copy_if_plan.shared_memory_size,
                                            num_tiles);
@@ -648,11 +646,11 @@ namespace __copy_if {
     cudaError_t status = cudaSuccess;
     if (num_items == 0)
       return status;
-    
+
     size_t allocation_sizes[2] = {0, vshmem_size};
     status = ScanTileState::AllocationSize(static_cast<int>(num_tiles), allocation_sizes[0]);
     CUDA_CUB_RET_IF_FAIL(status);
-    
+
 
     void* allocations[2] = {NULL, NULL};
     status = cub::AliasTemporaries(d_temp_storage,
@@ -660,7 +658,7 @@ namespace __copy_if {
                                    allocations,
                                    allocation_sizes);
     CUDA_CUB_RET_IF_FAIL(status);
-    
+
 
     if (d_temp_storage == NULL)
     {
@@ -671,11 +669,11 @@ namespace __copy_if {
     status = tile_status.Init(static_cast<int>(num_tiles), allocations[0], allocation_sizes[0]);
     CUDA_CUB_RET_IF_FAIL(status);
 
-    init_agent ia(init_plan, num_tiles, stream, "copy_if::init_agent", debug_sync);
+    init_agent ia(init_plan, num_tiles, stream, "copy_if::init_agent");
 
     char *vshmem_ptr = vshmem_size > 0 ? (char*)allocations[1] : NULL;
 
-    copy_if_agent pa(copy_if_plan, num_items, stream, vshmem_ptr, "copy_if::partition_agent", debug_sync);
+    copy_if_agent pa(copy_if_plan, num_items, stream, vshmem_ptr, "copy_if::partition_agent");
 
     ia.launch(tile_status, num_tiles, num_selected_out);
     CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
@@ -710,7 +708,6 @@ namespace __copy_if {
     size_type    num_items          = static_cast<size_type>(thrust::distance(first, last));
     size_t       temp_storage_bytes = 0;
     cudaStream_t stream             = cuda_cub::stream(policy);
-    bool         debug_sync         = THRUST_DEBUG_SYNC_FLAG;
 
     if (num_items == 0)
       return output;
@@ -724,8 +721,7 @@ namespace __copy_if {
                        predicate,
                        reinterpret_cast<size_type*>(NULL),
                        num_items,
-                       stream,
-                       debug_sync);
+                       stream);
     cuda_cub::throw_on_error(status, "copy_if failed on 1st step");
 
     size_t allocation_sizes[2] = {sizeof(size_type), temp_storage_bytes};
@@ -761,8 +757,7 @@ namespace __copy_if {
                        predicate,
                        d_num_selected_out,
                        num_items,
-                       stream,
-                       debug_sync);
+                       stream);
     cuda_cub::throw_on_error(status, "copy_if failed on 2nd step");
 
     status = cuda_cub::synchronize(policy);
@@ -791,28 +786,18 @@ copy_if(execution_policy<Derived> &policy,
         OutputIterator             result,
         Predicate                  pred)
 {
-  OutputIterator ret = result;
-
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __copy_if::copy_if(policy,
-                             first,
-                             last,
-                             __copy_if::no_stencil_tag(),
-                             result,
-                             pred);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::copy_if(cvt_to_seq(derived_cast(policy)),
-                          first,
-                          last,
-                          result,
-                          pred);
-#endif
-  }
-  return ret;
+  THRUST_CDP_DISPATCH((return __copy_if::copy_if(policy,
+                                                   first,
+                                                   last,
+                                                   __copy_if::no_stencil_tag(),
+                                                   result,
+                                                   pred);),
+                      (return
+                         thrust::copy_if(cvt_to_seq(derived_cast(policy)),
+                                         first,
+                                         last,
+                                         result,
+                                         pred);));
 } // func copy_if
 
 __thrust_exec_check_disable__
@@ -829,33 +814,18 @@ copy_if(execution_policy<Derived> &policy,
         OutputIterator             result,
         Predicate                  pred)
 {
-  OutputIterator ret = result;
-
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __copy_if::copy_if(policy,
-                             first,
-                             last,
-                             stencil,
-                             result,
-                             pred);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::copy_if(cvt_to_seq(derived_cast(policy)),
-                          first,
-                          last,
-                          stencil,
-                          result,
-                          pred);
-#endif
-  }
-  return ret;
+  THRUST_CDP_DISPATCH(
+    (return __copy_if::copy_if(policy, first, last, stencil, result, pred);),
+    (return thrust::copy_if(cvt_to_seq(derived_cast(policy)),
+                              first,
+                              last,
+                              stencil,
+                              result,
+                              pred);));
 }    // func copy_if
 
 }    // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #include <thrust/copy.h>
 #endif
diff --git a/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/system/cuda/detail/core/agent_launcher.h
index 0ed414e58..dbb26f33f 100644
--- a/thrust/system/cuda/detail/core/agent_launcher.h
+++ b/thrust/system/cuda/detail/core/agent_launcher.h
@@ -26,32 +26,28 @@
  ******************************************************************************/
 #pragma once
 
-#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <thrust/detail/config.h>
+
+#include <cub/detail/device_synchronize.cuh>
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 #include <thrust/system/cuda/detail/core/util.h>
 #include <cassert>
 
-#if 0
-#define __THRUST__TEMPLATE_DEBUG
-#endif
-
-#if __THRUST__TEMPLATE_DEBUG
-template<int...> class ID_impl;
-template<int... I> class Foo { ID_impl<I...> t;};
-#endif
+#include <nv/target>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 namespace core {
 
 
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA)
 #if 0
   template <class Agent, class... Args>
-  void __global__ 
-  __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS,Agent::ptx_plan::MIN_BLOCKS)
+  void __global__
+  __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
       _kernel_agent(Args... args)
   {
     extern __shared__ char shmem[];
@@ -59,119 +55,119 @@ namespace core {
   }
 #else
   template <class Agent, class _0>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent(_0 x0)
   {
     extern __shared__ char shmem[];
     Agent::entry(x0, shmem);
   }
   template <class Agent, class _0, class _1>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent(_0 x0, _1 x1)
   {
     extern __shared__ char shmem[];
     Agent::entry(x0, x1, shmem);
   }
   template <class Agent, class _0, class _1, class _2>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent(_0 x0, _1 x1, _2 x2)
   {
     extern __shared__ char shmem[];
     Agent::entry(x0, x1, x2, shmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3)
   {
     extern __shared__ char shmem[];
     Agent::entry(x0, x1, x2, x3, shmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4)
   {
     extern __shared__ char shmem[];
     Agent::entry(x0, x1, x2, x3, x4, shmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5)
   {
     extern __shared__ char shmem[];
     Agent::entry(x0, x1, x2, x3, x4, x5, shmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6)
   {
     extern __shared__ char shmem[];
     Agent::entry(x0, x1, x2, x3, x4, x5, x6, shmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7)
   {
     extern __shared__ char shmem[];
     Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, shmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8)
   {
     extern __shared__ char shmem[];
     Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, shmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9)
   {
     extern __shared__ char shmem[];
     Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, shmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA)
   {
     extern __shared__ char shmem[];
     Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, shmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB)
   {
     extern __shared__ char shmem[];
     Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, shmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC)
   {
     extern __shared__ char shmem[];
     Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, shmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD)
   {
     extern __shared__ char shmem[];
     Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, shmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE)
   {
     extern __shared__ char shmem[];
     Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE, shmem);
   }
 #endif
-  
+
   ////////////////////////////////////////////////////////////
 
 
 #if 0
   template <class Agent, class... Args>
-  void __global__ 
-  __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS,Agent::ptx_plan::MIN_BLOCKS)
+  void __global__
+  __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
       _kernel_agent_vshmem(char* vshmem, Args... args)
   {
     extern __shared__ char shmem[];
@@ -180,7 +176,7 @@ namespace core {
   }
 #else
   template <class Agent, class _0>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent_vshmem(char* vshmem, _0 x0)
   {
     extern __shared__ char shmem[];
@@ -188,7 +184,7 @@ namespace core {
     Agent::entry(x0, vshmem);
   }
   template <class Agent, class _0, class _1>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1)
   {
     extern __shared__ char shmem[];
@@ -196,7 +192,7 @@ namespace core {
     Agent::entry(x0, x1, vshmem);
   }
   template <class Agent, class _0, class _1, class _2>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2)
   {
     extern __shared__ char shmem[];
@@ -204,7 +200,7 @@ namespace core {
     Agent::entry(x0, x1, x2, vshmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3)
   {
     extern __shared__ char shmem[];
@@ -212,7 +208,7 @@ namespace core {
     Agent::entry(x0, x1, x2, x3, vshmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4)
   {
     extern __shared__ char shmem[];
@@ -220,7 +216,7 @@ namespace core {
     Agent::entry(x0, x1, x2, x3, x4, vshmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5)
   {
     extern __shared__ char shmem[];
@@ -228,7 +224,7 @@ namespace core {
     Agent::entry(x0, x1, x2, x3, x4, x5, vshmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6)
   {
     extern __shared__ char shmem[];
@@ -236,7 +232,7 @@ namespace core {
     Agent::entry(x0, x1, x2, x3, x4, x5, x6, vshmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7)
   {
     extern __shared__ char shmem[];
@@ -244,7 +240,7 @@ namespace core {
     Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, vshmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8)
   {
     extern __shared__ char shmem[];
@@ -252,7 +248,7 @@ namespace core {
     Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, vshmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9)
   {
     extern __shared__ char shmem[];
@@ -260,7 +256,7 @@ namespace core {
     Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, vshmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA)
   {
     extern __shared__ char shmem[];
@@ -268,7 +264,7 @@ namespace core {
     Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, vshmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB)
   {
     extern __shared__ char shmem[];
@@ -276,7 +272,7 @@ namespace core {
     Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, vshmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC)
   {
     extern __shared__ char shmem[];
@@ -284,7 +280,7 @@ namespace core {
     Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, vshmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD)
   {
     extern __shared__ char shmem[];
@@ -292,7 +288,7 @@ namespace core {
     Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, vshmem);
   }
   template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
-  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS, Agent::ptx_plan::MIN_BLOCKS)
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
   _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE)
   {
     extern __shared__ char shmem[];
@@ -379,7 +375,6 @@ namespace core {
     size_t          count;
     cudaStream_t    stream;
     char const*     name;
-    bool            debug_sync;
     unsigned int    grid;
     char*           vshmem;
     bool            has_shmem;
@@ -397,17 +392,15 @@ namespace core {
                                   MAX_SHMEM_PER_BLOCK> shm1;
 
     template <class Size>
-    CUB_RUNTIME_FUNCTION
+    THRUST_RUNTIME_FUNCTION
     AgentLauncher(AgentPlan    plan_,
                   Size         count_,
                   cudaStream_t stream_,
-                  char const*  name_,
-                  bool         debug_sync_)
+                  char const*  name_)
         : plan(plan_),
           count((size_t)count_),
           stream(stream_),
           name(name_),
-          debug_sync(debug_sync_),
           grid(static_cast<unsigned int>((count + plan.items_per_tile - 1) / plan.items_per_tile)),
           vshmem(NULL),
           has_shmem((size_t)core::get_max_shared_memory_per_block() >= (size_t)plan.shared_memory_size),
@@ -417,18 +410,16 @@ namespace core {
     }
 
     template <class Size>
-    CUB_RUNTIME_FUNCTION
+    THRUST_RUNTIME_FUNCTION
     AgentLauncher(AgentPlan    plan_,
                   Size         count_,
                   cudaStream_t stream_,
                   char*        vshmem,
-                  char const*  name_,
-                  bool         debug_sync_)
+                  char const*  name_)
         : plan(plan_),
           count((size_t)count_),
           stream(stream_),
           name(name_),
-          debug_sync(debug_sync_),
           grid(static_cast<unsigned int>((count + plan.items_per_tile - 1) / plan.items_per_tile)),
           vshmem(vshmem),
           has_shmem((size_t)core::get_max_shared_memory_per_block() >= (size_t)plan.shared_memory_size),
@@ -436,17 +427,15 @@ namespace core {
     {
       assert(count > 0);
     }
-    
-    CUB_RUNTIME_FUNCTION
+
+    THRUST_RUNTIME_FUNCTION
     AgentLauncher(AgentPlan    plan_,
                   cudaStream_t stream_,
-                  char const*  name_,
-                  bool         debug_sync_)
+                  char const*  name_)
         : plan(plan_),
           count(0),
           stream(stream_),
           name(name_),
-          debug_sync(debug_sync_),
           grid(plan.grid_size),
           vshmem(NULL),
           has_shmem((size_t)core::get_max_shared_memory_per_block() >= (size_t)plan.shared_memory_size),
@@ -455,17 +444,15 @@ namespace core {
       assert(plan.grid_size > 0);
     }
 
-    CUB_RUNTIME_FUNCTION
+    THRUST_RUNTIME_FUNCTION
     AgentLauncher(AgentPlan    plan_,
                   cudaStream_t stream_,
                   char*        vshmem,
-                  char const*  name_,
-                  bool         debug_sync_)
+                  char const*  name_)
         : plan(plan_),
           count(0),
           stream(stream_),
           name(name_),
-          debug_sync(debug_sync_),
           grid(plan.grid_size),
           vshmem(vshmem),
           has_shmem((size_t)core::get_max_shared_memory_per_block() >= (size_t)plan.shared_memory_size),
@@ -488,9 +475,7 @@ namespace core {
 #ifdef __CUDACC_RDC__
       return core::get_agent_plan<Agent>(s, d_ptr);
 #else
-      core::cuda_optional<int> ptx_version = core::get_ptx_version();
-      //CUDA_CUB_RET_IF_FAIL(ptx_version.status());
-      return get_agent_plan<Agent>(ptx_version);
+      return get_agent_plan<Agent>(core::get_ptx_version());
 #endif
     }
     THRUST_RUNTIME_FUNCTION
@@ -499,31 +484,23 @@ namespace core {
       return get_agent_plan<Agent>(sm_arch<0>::type::ver);
     }
 #endif
-    
-    CUB_RUNTIME_FUNCTION
+
+    THRUST_RUNTIME_FUNCTION
     typename core::get_plan<Agent>::type static get_plan(cudaStream_t , void* d_ptr = 0)
     {
       THRUST_UNUSED_VAR(d_ptr);
-      core::cuda_optional<int> ptx_version = core::get_ptx_version();
-      return get_agent_plan<Agent>(ptx_version);
+      return get_agent_plan<Agent>(core::get_ptx_version());
     }
-    
+
     THRUST_RUNTIME_FUNCTION
     typename core::get_plan<Agent>::type static get_plan()
     {
       return get_agent_plan<Agent>(lowest_supported_sm_arch::ver);
     }
 
-    CUB_RUNTIME_FUNCTION void sync() const
+    THRUST_RUNTIME_FUNCTION void sync() const
     {
-      if (debug_sync)
-      {
-#ifdef __CUDA_ARCH__
-        cudaDeviceSynchronize();
-#else
-        cudaStreamSynchronize(stream);
-#endif
-      }
+      CubDebug(cub::detail::DebugSyncStream(stream));
     }
 
     template<class K>
@@ -542,44 +519,43 @@ namespace core {
       return max_blocks_per_sm_impl(k, plan.block_threads);
     }
 
-
-    
     template<class K>
     THRUST_RUNTIME_FUNCTION
     void print_info(K k) const
     {
-      if (debug_sync)
+      #if THRUST_DEBUG_SYNC_FLAG 
+      cuda_optional<int> occ = max_sm_occupancy(k);
+      const int ptx_version = core::get_ptx_version();
+      if (count > 0)
+      {
+        _CubLog("Invoking %s<<<%u, %d, %d, %lld>>>(), %llu items total, %d items per thread, %d SM occupancy, %d vshmem size, %d ptx_version \n",
+                name,
+                grid,
+                plan.block_threads,
+                (has_shmem ? (int)plan.shared_memory_size : 0),
+                (long long)stream,
+                (long long)count,
+                plan.items_per_thread,
+                (int)occ,
+                (!has_shmem ? (int)plan.shared_memory_size : 0),
+                (int)ptx_version);
+      }
+      else
       {
-        cuda_optional<int> occ = max_sm_occupancy(k);
-        core::cuda_optional<int> ptx_version = core::get_ptx_version();
-        if (count > 0)
-        {
-          _CubLog("Invoking %s<<<%u, %d, %d, %lld>>>(), %llu items total, %d items per thread, %d SM occupancy, %d vshmem size, %d ptx_version \n",
-                  name,
-                  grid,
-                  plan.block_threads,
-                  (has_shmem ? (int)plan.shared_memory_size : 0),
-                  (long long)stream,
-                  (long long)count,
-                  plan.items_per_thread,
-                  (int)occ,
-                  (!has_shmem ? (int)plan.shared_memory_size : 0),
-                  (int)ptx_version);
-        }
-        else
-        {
-          _CubLog("Invoking %s<<<%u, %d, %d, %lld>>>(), %d items per thread, %d SM occupancy, %d vshmem size, %d ptx_version\n",
-                  name,
-                  grid,
-                  plan.block_threads,
-                  (has_shmem ? (int)plan.shared_memory_size : 0),
-                  (long long)stream,
-                  plan.items_per_thread,
-                  (int)occ,
-                  (!has_shmem ? (int)plan.shared_memory_size : 0),
-                  (int)ptx_version);
-        }
+        _CubLog("Invoking %s<<<%u, %d, %d, %lld>>>(), %d items per thread, %d SM occupancy, %d vshmem size, %d ptx_version\n",
+                name,
+                grid,
+                plan.block_threads,
+                (has_shmem ? (int)plan.shared_memory_size : 0),
+                (long long)stream,
+                plan.items_per_thread,
+                (int)occ,
+                (!has_shmem ? (int)plan.shared_memory_size : 0),
+                (int)ptx_version);
       }
+      #else
+      (void)k;
+      #endif
     }
 
     ////////////////////
@@ -705,11 +681,11 @@ namespace core {
 
 #if 0
 
-    // If we are guaranteed to have enough shared memory 
+    // If we are guaranteed to have enough shared memory
     // don't compile other kernel which accepts pointer
     // and save on compilations
     template <class... Args>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::true_type, Args... args) const
     {
       assert(has_shmem && vshmem == NULL);
@@ -717,17 +693,17 @@ namespace core {
       launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
           .doit(_kernel_agent<Agent, Args...>, args...);
     }
-    
-    // If there is a risk of not having enough shared memory 
+
+    // If there is a risk of not having enough shared memory
     // we compile generic kernel instead.
     // This kernel is likely to be somewhat slower, but it can accomodate
     // both shared and virtualized shared memories.
     // Alternative option is to compile two kernels, one using shared and one
     // using virtualized shared memory. While this can be slightly faster if we
     // do actually have enough shared memory, the compilation time will double.
-    // 
+    //
     template <class... Args>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::false_type, Args... args) const
     {
       assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
@@ -737,25 +713,15 @@ namespace core {
     }
 
     template <class... Args>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch(Args... args) const
     {
-#if __THRUST__TEMPLATE_DEBUG
-#ifdef __CUDA_ARCH__
-      typedef typename Foo<
-        shm1::v1,
-        shm1::v2,
-        shm1::v3,
-        shm1::v4,
-        shm1::v5>::t tt;
-#endif
-#endif
       launch_impl(has_enough_shmem_t(),args...);
       sync();
     }
 #else
     template <class _0>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::false_type, _0 x0) const
     {
       assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
@@ -765,7 +731,7 @@ namespace core {
           .doit(ptr, vshmem, x0);
     }
     template <class _0, class _1>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::false_type, _0 x0, _1 x1) const
     {
       assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
@@ -775,7 +741,7 @@ namespace core {
           .doit(ptr, vshmem, x0, x1);
     }
     template <class _0, class _1, class _2>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2) const
     {
       assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
@@ -785,7 +751,7 @@ namespace core {
           .doit(ptr, vshmem, x0, x1, x2);
     }
     template <class _0, class _1, class _2, class _3>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3) const
     {
       assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
@@ -795,7 +761,7 @@ namespace core {
           .doit(ptr, vshmem, x0, x1, x2, x3);
     }
     template <class _0, class _1, class _2, class _3, class _4>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const
     {
       assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
@@ -805,7 +771,7 @@ namespace core {
           .doit(ptr, vshmem, x0, x1, x2, x3, x4);
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const
     {
       assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
@@ -815,7 +781,7 @@ namespace core {
           .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5);
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const
     {
       assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
@@ -825,7 +791,7 @@ namespace core {
           .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6);
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const
     {
       assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
@@ -835,7 +801,7 @@ namespace core {
           .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7);
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const
     {
       assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
@@ -844,7 +810,7 @@ namespace core {
           .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8);
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const
     {
       assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
@@ -854,7 +820,7 @@ namespace core {
           .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9);
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9,_xA xA) const
     {
       assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
@@ -864,7 +830,7 @@ namespace core {
           .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA);
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9,_xA xA,_xB xB) const
     {
       assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
@@ -874,7 +840,7 @@ namespace core {
           .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB);
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9,_xA xA,_xB xB,_xC xC) const
     {
       assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
@@ -884,7 +850,7 @@ namespace core {
           .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC);
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9,_xA xA,_xB xB,_xC xC,_xD xD) const
     {
       assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
@@ -894,7 +860,7 @@ namespace core {
           .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD);
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9,_xA xA,_xB xB,_xC xC,_xD xD,_xE xE) const
     {
       assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
@@ -909,7 +875,7 @@ namespace core {
     ////////////////////////////////////////////////////////
 
     template <class _0>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::true_type, _0 x0) const
     {
       assert(has_shmem && vshmem == NULL);
@@ -919,7 +885,7 @@ namespace core {
           .doit(ptr, x0);
     }
     template <class _0, class _1>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::true_type, _0 x0, _1 x1) const
     {
       assert(has_shmem && vshmem == NULL);
@@ -929,7 +895,7 @@ namespace core {
           .doit(ptr, x0, x1);
     }
     template <class _0, class _1, class _2>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2) const
     {
       assert(has_shmem && vshmem == NULL);
@@ -939,7 +905,7 @@ namespace core {
           .doit(ptr, x0, x1, x2);
     }
     template <class _0, class _1, class _2, class _3>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3) const
     {
       assert(has_shmem && vshmem == NULL);
@@ -949,7 +915,7 @@ namespace core {
           .doit(ptr, x0, x1, x2, x3);
     }
     template <class _0, class _1, class _2, class _3, class _4>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const
     {
       assert(has_shmem && vshmem == NULL);
@@ -959,7 +925,7 @@ namespace core {
           .doit(ptr, x0, x1, x2, x3, x4);
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const
     {
       assert(has_shmem && vshmem == NULL);
@@ -969,7 +935,7 @@ namespace core {
           .doit(ptr, x0, x1, x2, x3, x4, x5);
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const
     {
       assert(has_shmem && vshmem == NULL);
@@ -979,7 +945,7 @@ namespace core {
           .doit(ptr, x0, x1, x2, x3, x4, x5, x6);
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const
     {
       assert(has_shmem && vshmem == NULL);
@@ -989,7 +955,7 @@ namespace core {
           .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7);
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const
     {
       assert(has_shmem && vshmem == NULL);
@@ -999,7 +965,7 @@ namespace core {
           .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8);
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const
     {
       assert(has_shmem && vshmem == NULL);
@@ -1009,7 +975,7 @@ namespace core {
           .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9);
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const
     {
       assert(has_shmem && vshmem == NULL);
@@ -1019,7 +985,7 @@ namespace core {
           .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA);
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const
     {
       assert(has_shmem && vshmem == NULL);
@@ -1029,7 +995,7 @@ namespace core {
           .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB);
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const
     {
       assert(has_shmem && vshmem == NULL);
@@ -1039,7 +1005,7 @@ namespace core {
           .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC);
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD) const
     {
       assert(has_shmem && vshmem == NULL);
@@ -1049,7 +1015,7 @@ namespace core {
           .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD);
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE) const
     {
       assert(has_shmem && vshmem == NULL);
@@ -1062,107 +1028,107 @@ namespace core {
     ////////////////////////////////////////////////////////
     ////////////////////////////////////////////////////////
     ////////////////////////////////////////////////////////
-    
+
     template <class _0>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch(_0 x0) const
     {
       launch_impl(has_enough_shmem_t(), x0);
       sync();
     }
     template <class _0, class _1>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch(_0 x0, _1 x1) const
     {
       launch_impl(has_enough_shmem_t(), x0, x1);
       sync();
     }
     template <class _0, class _1, class _2>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch(_0 x0, _1 x1, _2 x2) const
     {
       launch_impl(has_enough_shmem_t(), x0, x1, x2);
       sync();
     }
     template <class _0, class _1, class _2, class _3>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch(_0 x0, _1 x1, _2 x2, _3 x3) const
     {
       launch_impl(has_enough_shmem_t(), x0, x1, x2, x3);
       sync();
     }
     template <class _0, class _1, class _2, class _3, class _4>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const
     {
       launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4);
       sync();
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const
     {
       launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5);
       sync();
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const
     {
       launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6);
       sync();
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const
     {
       launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7);
       sync();
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const
     {
       launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8);
       sync();
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const
     {
       launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9);
       sync();
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const
     {
       launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA);
       sync();
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const
     {
       launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB);
       sync();
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const
     {
       launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC);
       sync();
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD) const
     {
       launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD);
       sync();
     }
     template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
-    void CUB_RUNTIME_FUNCTION
+    void THRUST_RUNTIME_FUNCTION
     launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE) const
     {
       launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE);
@@ -1175,5 +1141,5 @@ namespace core {
 
 }    // namespace core
 }
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/core/alignment.h b/thrust/system/cuda/detail/core/alignment.h
index bf3873efe..4b807ebc1 100644
--- a/thrust/system/cuda/detail/core/alignment.h
+++ b/thrust/system/cuda/detail/core/alignment.h
@@ -18,9 +18,11 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/system/cuda/detail/util.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 namespace alignment_of_detail {
 
@@ -245,4 +247,4 @@ struct aligned_storage
 
 }    // end cuda_
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
diff --git a/thrust/system/cuda/detail/core/triple_chevron_launch.h b/thrust/system/cuda/detail/core/triple_chevron_launch.h
index 8ed5fd5f2..65a7283b7 100644
--- a/thrust/system/cuda/detail/core/triple_chevron_launch.h
+++ b/thrust/system/cuda/detail/core/triple_chevron_launch.h
@@ -32,7 +32,7 @@
 #include <cassert>
 
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace cuda_cub {
 namespace launcher {
@@ -45,7 +45,7 @@ namespace launcher {
     Size const shared_mem;
     cudaStream_t const stream;
 
-    CUB_RUNTIME_FUNCTION
+    THRUST_RUNTIME_FUNCTION
     triple_chevron(dim3         grid_,
                    dim3         block_,
                    Size         shared_mem_ = 0,
@@ -55,7 +55,6 @@ namespace launcher {
           shared_mem(shared_mem_),
           stream(stream_) {}
 
-#if 0
     template<class K, class... Args>
     cudaError_t __host__
     doit_host(K k, Args const&... args) const
@@ -63,120 +62,6 @@ namespace launcher {
       k<<<grid, block, shared_mem, stream>>>(args...);
       return cudaPeekAtLastError();
     }
-#else
-    template <class K, class _0>
-    cudaError_t __host__
-    doit_host(K k, _0 x0) const
-    {
-      k<<<grid, block, shared_mem, stream>>>(x0);
-      return cudaPeekAtLastError();
-    }
-    template <class K, class _0, class _1>
-    cudaError_t __host__
-    doit_host(K k, _0 x0, _1 x1) const
-    {
-      k<<<grid, block, shared_mem, stream>>>(x0,x1);
-      return cudaPeekAtLastError();
-    }
-    template <class K, class _0, class _1, class _2>
-    cudaError_t __host__
-    doit_host(K k, _0 x0, _1 x1, _2 x2) const
-    {
-      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2);
-      return cudaPeekAtLastError();
-    }
-    template <class K, class _0, class _1, class _2, class _3>
-    cudaError_t __host__
-    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3) const
-    {
-      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3);
-      return cudaPeekAtLastError();
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4>
-    cudaError_t __host__
-    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const
-    {
-      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4);
-      return cudaPeekAtLastError();
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5>
-    cudaError_t __host__
-    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const
-    {
-      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5);
-      return cudaPeekAtLastError();
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6>
-    cudaError_t __host__
-    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const
-    {
-      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5,x6);
-      return cudaPeekAtLastError();
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
-    cudaError_t __host__
-    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const
-    {
-      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5,x6,x7);
-      return cudaPeekAtLastError();
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
-    cudaError_t __host__
-    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const
-    {
-      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5,x6,x7,x8);
-      return cudaPeekAtLastError();
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
-    cudaError_t __host__
-    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const
-    {
-      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9);
-      return cudaPeekAtLastError();
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
-    cudaError_t __host__
-    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const
-    {
-      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA);
-      return cudaPeekAtLastError();
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
-    cudaError_t __host__
-    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const
-    {
-      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB);
-      return cudaPeekAtLastError();
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
-    cudaError_t __host__
-    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const
-    {
-      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC);
-      return cudaPeekAtLastError();
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
-    cudaError_t __host__
-    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD) const
-    {
-      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD);
-      return cudaPeekAtLastError();
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
-    cudaError_t __host__
-    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE) const
-    {
-      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE);
-      return cudaPeekAtLastError();
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE, class _xF>
-    cudaError_t __host__
-    doit_host(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE, _xF xF) const
-    {
-      k<<<grid, block, shared_mem, stream>>>(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE,xF);
-      return cudaPeekAtLastError();
-    }
-#endif
 
     template<class T>
     size_t __device__
@@ -186,7 +71,6 @@ namespace launcher {
       return alignment * ((offset + (alignment - 1))/ alignment);
     }
 
-#if 0
     size_t __device__ argument_pack_size(size_t size) const { return size; }
     template <class Arg, class... Args>
     size_t __device__
@@ -195,110 +79,6 @@ namespace launcher {
       size = align_up<Arg>(size);
       return argument_pack_size(size + sizeof(Arg), args...);
     }
-#else
-    template <class Arg>
-    size_t __device__
-    argument_pack_size(size_t size, Arg) const
-    {
-      return align_up<Arg>(size) + sizeof(Arg);
-    }
-    template <class Arg, class _0>
-    size_t __device__
-    argument_pack_size(size_t size, Arg, _0 x0) const
-    {
-      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0);
-    }
-    template <class Arg, class _0, class _1>
-    size_t __device__
-    argument_pack_size(size_t size, Arg, _0 x0, _1 x1) const
-    {
-      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1);
-    }
-    template <class Arg, class _0, class _1, class _2>
-    size_t __device__
-    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2) const
-    {
-      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3>
-    size_t __device__
-    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3) const
-    {
-      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4>
-    size_t __device__
-    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const
-    {
-      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5>
-    size_t __device__
-    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const
-    {
-      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6>
-    size_t __device__
-    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const
-    {
-      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
-    size_t __device__
-    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const
-    {
-      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
-    size_t __device__
-    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const
-    {
-      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
-    size_t __device__
-    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const
-    {
-      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
-    size_t __device__
-    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const
-    {
-      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
-    size_t __device__
-    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const
-    {
-      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
-    size_t __device__
-    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const
-    {
-      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
-    size_t __device__
-    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD) const
-    {
-      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
-    size_t __device__
-    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE) const
-    {
-      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE, class _xF>
-    size_t __device__
-    argument_pack_size(size_t size, Arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE, _xF xF) const
-    {
-      return argument_pack_size(align_up<Arg>(size) + sizeof(Arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE, xF);
-    }
-#endif /* variadic */
 
     template <class Arg>
     size_t __device__ copy_arg(char* buffer, size_t offset, Arg arg) const
@@ -309,663 +89,64 @@ namespace launcher {
       return offset + sizeof(Arg);
     }
 
-#if 0
-    void __device__ fill_arguments(char*, size_t) const {}
+    __device__
+    void fill_arguments(char*, size_t) const
+    {}
+
     template<class Arg, class... Args>
-    void __device__
-    fill_arguments(char* buffer, size_t offset, Arg const& arg, Args const& ... args) const
+    __device__
+    void fill_arguments(char* buffer,
+                     size_t offset,
+                     Arg const& arg,
+                     Args const& ... args) const
     {
       fill_arguments(buffer, copy_arg(buffer, offset, arg), args...);
     }
-#else
-    template<class Arg>
-    void __device__
-    fill_arguments(char* buffer, size_t offset, Arg arg) const
-    {
-      copy_arg(buffer, offset, arg);
-    }
-    template<class Arg, class _0>
-    void __device__
-    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0) const
-    {
-      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0);
-    }
-    template <class Arg, class _0, class _1>
-    void __device__
-    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1) const
-    {
-      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1);
-    }
-    template <class Arg, class _0, class _1, class _2>
-    void __device__
-    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2) const
-    {
-      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3>
-    void __device__
-    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3) const
-    {
-      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4>
-    void __device__
-    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const
-    {
-      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5>
-    void __device__
-    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const
-    {
-      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6>
-    void __device__
-    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const
-    {
-      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
-    void __device__
-    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const
-    {
-      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
-    void __device__
-    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const
-    {
-      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
-    void __device__
-    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const
-    {
-      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
-    void __device__
-    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const
-    {
-      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
-    void __device__
-    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const
-    {
-      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
-    void __device__
-    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const
-    {
-      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
-    void __device__
-    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD) const
-    {
-      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
-    void __device__
-    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE) const
-    {
-      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE);
-    }
-    template <class Arg, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE, class _xF>
-    void __device__
-    fill_arguments(char* buffer, size_t offset, Arg arg, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE, _xF xF) const
-    {
-      fill_arguments(buffer, copy_arg(buffer, offset, arg), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE, xF);
-    }
-#endif /* variadic */
 
-#if 0
+    #ifdef THRUST_RDC_ENABLED
     template<class K, class... Args>
     cudaError_t __device__
     doit_device(K k, Args const&... args) const
     {
-      cudaError_t status = cudaErrorNotSupported;
-#if __THRUST_HAS_CUDART__
       const size_t size = argument_pack_size(0,args...);
       void *param_buffer = cudaGetParameterBuffer(64,size);
       fill_arguments((char*)param_buffer, 0, args...);
-      status = launch_device(k, param_buffer);
-#endif
-      return status;
-    }
-#else
-    template<class K, class _0>
-    cudaError_t __device__
-    doit_device(K k, _0 x0) const
-    {
-      cudaError_t status = cudaErrorNotSupported;
-#if __THRUST_HAS_CUDART__
-      const size_t size = argument_pack_size(0,x0);
-      void *param_buffer = cudaGetParameterBuffer(64,size);
-      fill_arguments((char*)param_buffer, 0, x0);
-      status = launch_device(k, param_buffer);
-#else
-      THRUST_UNUSED_VAR(k);
-      THRUST_UNUSED_VAR(x0);
-#endif
-      return status;
-    }
-    template <class K, class _0, class _1>
-    cudaError_t __device__
-    doit_device(K k, _0 x0, _1 x1) const
-    {
-      cudaError_t status = cudaErrorNotSupported;
-#if __THRUST_HAS_CUDART__
-      const size_t size = argument_pack_size(0,x0,x1);
-      void *param_buffer = cudaGetParameterBuffer(64,size);
-      fill_arguments((char*)param_buffer, 0, x0,x1);
-      status = launch_device(k, param_buffer);
-#else
-      THRUST_UNUSED_VAR(k);
-      THRUST_UNUSED_VAR(x0);
-      THRUST_UNUSED_VAR(x1);
-#endif
-      return status;
-    }
-    template <class K, class _0, class _1, class _2>
-    cudaError_t __device__
-    doit_device(K k, _0 x0, _1 x1, _2 x2) const
-    {
-      cudaError_t status = cudaErrorNotSupported;
-#if __THRUST_HAS_CUDART__
-      const size_t size = argument_pack_size(0,x0,x1,x2);
-      void *param_buffer = cudaGetParameterBuffer(64,size);
-      fill_arguments((char*)param_buffer, 0, x0,x1,x2);
-      status = launch_device(k, param_buffer);
-#else
-      THRUST_UNUSED_VAR(k);
-      THRUST_UNUSED_VAR(x0);
-      THRUST_UNUSED_VAR(x1);
-      THRUST_UNUSED_VAR(x2);
-#endif
-      return status;
-    }
-    template <class K, class _0, class _1, class _2, class _3>
-    cudaError_t __device__
-    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3) const
-    {
-      cudaError_t status = cudaErrorNotSupported;
-#if __THRUST_HAS_CUDART__
-      const size_t size = argument_pack_size(0,x0,x1,x2,x3);
-      void *param_buffer = cudaGetParameterBuffer(64,size);
-      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3);
-      status = launch_device(k, param_buffer);
-#else
-      THRUST_UNUSED_VAR(k);
-      THRUST_UNUSED_VAR(x0);
-      THRUST_UNUSED_VAR(x1);
-      THRUST_UNUSED_VAR(x2);
-      THRUST_UNUSED_VAR(x3);
-#endif
-      return status;
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4>
-    cudaError_t __device__
-    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const
-    {
-      cudaError_t status = cudaErrorNotSupported;
-#if __THRUST_HAS_CUDART__
-      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4);
-      void *param_buffer = cudaGetParameterBuffer(64,size);
-      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4);
-      status = launch_device(k, param_buffer);
-#else
-      THRUST_UNUSED_VAR(k);
-      THRUST_UNUSED_VAR(x0);
-      THRUST_UNUSED_VAR(x1);
-      THRUST_UNUSED_VAR(x2);
-      THRUST_UNUSED_VAR(x3);
-      THRUST_UNUSED_VAR(x4);
-#endif
-      return status;
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5>
-    cudaError_t __device__
-    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const
-    {
-      cudaError_t status = cudaErrorNotSupported;
-#if __THRUST_HAS_CUDART__
-      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5);
-      void *param_buffer = cudaGetParameterBuffer(64,size);
-      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5);
-      status = launch_device(k, param_buffer);
-#else
-      THRUST_UNUSED_VAR(k);
-      THRUST_UNUSED_VAR(x0);
-      THRUST_UNUSED_VAR(x1);
-      THRUST_UNUSED_VAR(x2);
-      THRUST_UNUSED_VAR(x3);
-      THRUST_UNUSED_VAR(x4);
-      THRUST_UNUSED_VAR(x5);
-#endif
-      return status;
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6>
-    cudaError_t __device__
-    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const
-    {
-      cudaError_t status = cudaErrorNotSupported;
-#if __THRUST_HAS_CUDART__
-      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6);
-      void *param_buffer = cudaGetParameterBuffer(64,size);
-      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6);
-      status = launch_device(k, param_buffer);
-#else
-      THRUST_UNUSED_VAR(k);
-      THRUST_UNUSED_VAR(x0);
-      THRUST_UNUSED_VAR(x1);
-      THRUST_UNUSED_VAR(x2);
-      THRUST_UNUSED_VAR(x3);
-      THRUST_UNUSED_VAR(x4);
-      THRUST_UNUSED_VAR(x5);
-      THRUST_UNUSED_VAR(x6);
-#endif
-      return status;
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
-    cudaError_t __device__
-    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const
-    {
-      cudaError_t status = cudaErrorNotSupported;
-#if __THRUST_HAS_CUDART__
-      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7);
-      void *param_buffer = cudaGetParameterBuffer(64,size);
-      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7);
-      status = launch_device(k, param_buffer);
-#else
-      THRUST_UNUSED_VAR(k);
-      THRUST_UNUSED_VAR(x0);
-      THRUST_UNUSED_VAR(x1);
-      THRUST_UNUSED_VAR(x2);
-      THRUST_UNUSED_VAR(x3);
-      THRUST_UNUSED_VAR(x4);
-      THRUST_UNUSED_VAR(x5);
-      THRUST_UNUSED_VAR(x6);
-      THRUST_UNUSED_VAR(x7);
-#endif
-      return status;
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
-    cudaError_t __device__
-    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const
-    {
-      cudaError_t status = cudaErrorNotSupported;
-#if __THRUST_HAS_CUDART__
-      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8);
-      void *param_buffer = cudaGetParameterBuffer(64,size);
-      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8);
-      status = launch_device(k, param_buffer);
-#else
-      THRUST_UNUSED_VAR(k);
-      THRUST_UNUSED_VAR(x0);
-      THRUST_UNUSED_VAR(x1);
-      THRUST_UNUSED_VAR(x2);
-      THRUST_UNUSED_VAR(x3);
-      THRUST_UNUSED_VAR(x4);
-      THRUST_UNUSED_VAR(x5);
-      THRUST_UNUSED_VAR(x6);
-      THRUST_UNUSED_VAR(x7);
-      THRUST_UNUSED_VAR(x8);
-#endif
-      return status;
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
-    cudaError_t __device__
-    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const
-    {
-      cudaError_t status = cudaErrorNotSupported;
-#if __THRUST_HAS_CUDART__
-      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9);
-      void *param_buffer = cudaGetParameterBuffer(64,size);
-      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9);
-      status = launch_device(k, param_buffer);
-#else
-      THRUST_UNUSED_VAR(k);
-      THRUST_UNUSED_VAR(x0);
-      THRUST_UNUSED_VAR(x1);
-      THRUST_UNUSED_VAR(x2);
-      THRUST_UNUSED_VAR(x3);
-      THRUST_UNUSED_VAR(x4);
-      THRUST_UNUSED_VAR(x5);
-      THRUST_UNUSED_VAR(x6);
-      THRUST_UNUSED_VAR(x7);
-      THRUST_UNUSED_VAR(x8);
-      THRUST_UNUSED_VAR(x9);
-#endif
-      return status;
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
-    cudaError_t __device__
-    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const
-    {
-      cudaError_t status = cudaErrorNotSupported;
-#if __THRUST_HAS_CUDART__
-      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA);
-      void *param_buffer = cudaGetParameterBuffer(64,size);
-      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA);
-      status = launch_device(k, param_buffer);
-#else
-      THRUST_UNUSED_VAR(k);
-      THRUST_UNUSED_VAR(x0);
-      THRUST_UNUSED_VAR(x1);
-      THRUST_UNUSED_VAR(x2);
-      THRUST_UNUSED_VAR(x3);
-      THRUST_UNUSED_VAR(x4);
-      THRUST_UNUSED_VAR(x5);
-      THRUST_UNUSED_VAR(x6);
-      THRUST_UNUSED_VAR(x7);
-      THRUST_UNUSED_VAR(x8);
-      THRUST_UNUSED_VAR(x9);
-      THRUST_UNUSED_VAR(xA);
-#endif
-      return status;
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
-    cudaError_t __device__
-    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const
-    {
-      cudaError_t status = cudaErrorNotSupported;
-#if __THRUST_HAS_CUDART__
-      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB);
-      void *param_buffer = cudaGetParameterBuffer(64,size);
-      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB);
-      status = launch_device(k, param_buffer);
-#else
-      THRUST_UNUSED_VAR(k);
-      THRUST_UNUSED_VAR(x0);
-      THRUST_UNUSED_VAR(x1);
-      THRUST_UNUSED_VAR(x2);
-      THRUST_UNUSED_VAR(x3);
-      THRUST_UNUSED_VAR(x4);
-      THRUST_UNUSED_VAR(x5);
-      THRUST_UNUSED_VAR(x6);
-      THRUST_UNUSED_VAR(x7);
-      THRUST_UNUSED_VAR(x8);
-      THRUST_UNUSED_VAR(x9);
-      THRUST_UNUSED_VAR(xA);
-      THRUST_UNUSED_VAR(xB);
-#endif
-      return status;
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
-    cudaError_t __device__
-    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const
-    {
-      cudaError_t status = cudaErrorNotSupported;
-#if __THRUST_HAS_CUDART__
-      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC);
-      void *param_buffer = cudaGetParameterBuffer(64,size);
-      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC);
-      status = launch_device(k, param_buffer);
-#else
-      THRUST_UNUSED_VAR(k);
-      THRUST_UNUSED_VAR(x0);
-      THRUST_UNUSED_VAR(x1);
-      THRUST_UNUSED_VAR(x2);
-      THRUST_UNUSED_VAR(x3);
-      THRUST_UNUSED_VAR(x4);
-      THRUST_UNUSED_VAR(x5);
-      THRUST_UNUSED_VAR(x6);
-      THRUST_UNUSED_VAR(x7);
-      THRUST_UNUSED_VAR(x8);
-      THRUST_UNUSED_VAR(x9);
-      THRUST_UNUSED_VAR(xA);
-      THRUST_UNUSED_VAR(xB);
-      THRUST_UNUSED_VAR(xC);
-#endif
-      return status;
+      return launch_device(k, param_buffer);
     }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
-    cudaError_t __device__
-    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD) const
-    {
-      cudaError_t status = cudaErrorNotSupported;
-#if __THRUST_HAS_CUDART__
-      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD);
-      void *param_buffer = cudaGetParameterBuffer(64,size);
-      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD);
-      status = launch_device(k, param_buffer);
-#else
-      THRUST_UNUSED_VAR(k);
-      THRUST_UNUSED_VAR(x0);
-      THRUST_UNUSED_VAR(x1);
-      THRUST_UNUSED_VAR(x2);
-      THRUST_UNUSED_VAR(x3);
-      THRUST_UNUSED_VAR(x4);
-      THRUST_UNUSED_VAR(x5);
-      THRUST_UNUSED_VAR(x6);
-      THRUST_UNUSED_VAR(x7);
-      THRUST_UNUSED_VAR(x8);
-      THRUST_UNUSED_VAR(x9);
-      THRUST_UNUSED_VAR(xA);
-      THRUST_UNUSED_VAR(xB);
-      THRUST_UNUSED_VAR(xC);
-      THRUST_UNUSED_VAR(xD);
-#endif
-      return status;
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
-    cudaError_t __device__
-    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE) const
-    {
-      cudaError_t status = cudaErrorNotSupported;
-#if __THRUST_HAS_CUDART__
-      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE);
-      void *param_buffer = cudaGetParameterBuffer(64,size);
-      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE);
-      status = launch_device(k, param_buffer);
-#else
-      THRUST_UNUSED_VAR(k);
-      THRUST_UNUSED_VAR(x0);
-      THRUST_UNUSED_VAR(x1);
-      THRUST_UNUSED_VAR(x2);
-      THRUST_UNUSED_VAR(x3);
-      THRUST_UNUSED_VAR(x4);
-      THRUST_UNUSED_VAR(x5);
-      THRUST_UNUSED_VAR(x6);
-      THRUST_UNUSED_VAR(x7);
-      THRUST_UNUSED_VAR(x8);
-      THRUST_UNUSED_VAR(x9);
-      THRUST_UNUSED_VAR(xA);
-      THRUST_UNUSED_VAR(xB);
-      THRUST_UNUSED_VAR(xC);
-      THRUST_UNUSED_VAR(xD);
-      THRUST_UNUSED_VAR(xE);
-#endif
-      return status;
-    }
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE, class _xF>
-    cudaError_t __device__
-    doit_device(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC,_xD xD, _xE xE, _xF xF) const
-    {
-      cudaError_t status = cudaErrorNotSupported;
-#if __THRUST_HAS_CUDART__
-      const size_t size = argument_pack_size(0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE,xF);
-      void *param_buffer = cudaGetParameterBuffer(64,size);
-      fill_arguments((char*)param_buffer, 0, x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,xA,xB,xC,xD,xE,xF);
-      status = launch_device(k, param_buffer);
-#else
-      THRUST_UNUSED_VAR(k);
-      THRUST_UNUSED_VAR(x0);
-      THRUST_UNUSED_VAR(x1);
-      THRUST_UNUSED_VAR(x2);
-      THRUST_UNUSED_VAR(x3);
-      THRUST_UNUSED_VAR(x4);
-      THRUST_UNUSED_VAR(x5);
-      THRUST_UNUSED_VAR(x6);
-      THRUST_UNUSED_VAR(x7);
-      THRUST_UNUSED_VAR(x8);
-      THRUST_UNUSED_VAR(x9);
-      THRUST_UNUSED_VAR(xA);
-      THRUST_UNUSED_VAR(xB);
-      THRUST_UNUSED_VAR(xC);
-      THRUST_UNUSED_VAR(xD);
-      THRUST_UNUSED_VAR(xE);
-      THRUST_UNUSED_VAR(xF);
-#endif
-      return status;
-    }
-#endif /* variadic */
 
     template <class K>
     cudaError_t __device__
     launch_device(K k, void* buffer) const
     {
-#if __THRUST_HAS_CUDART__
       return cudaLaunchDevice((void*)k,
                               buffer,
                               dim3(grid),
                               dim3(block),
                               shared_mem,
                               stream);
-#else
-      THRUST_UNUSED_VAR(k);
-      THRUST_UNUSED_VAR(buffer);
+    }
+    #else 
+    template<class K, class... Args>
+    cudaError_t __device__
+    doit_device(K, Args const&... ) const
+    {
       return cudaErrorNotSupported;
-#endif
     }
+    #endif
 
-
-#ifdef __CUDA_ARCH__
-#define THRUST_TRIPLE_LAUNCHER_HOSTDEVICE doit_device
-#else
-#define THRUST_TRIPLE_LAUNCHER_HOSTDEVICE doit_host
-#endif
-
-#if 0
     __thrust_exec_check_disable__
     template <class K, class... Args>
-    cudaError_t THRUST_FUNCTION
-    doit(K k, Args const&... args) const
+    THRUST_FUNCTION
+    cudaError_t doit(K k, Args const&... args) const
     {
-      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, args...);
+      NV_IF_TARGET(NV_IS_HOST,
+                   (return doit_host(k, args...);),
+                   (return doit_device(k, args...);));
     }
-#else
-    __thrust_exec_check_disable__
-    template <class K, class _0>
-    cudaError_t THRUST_FUNCTION
-    doit(K k, _0 x0) const
-    {
-      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0);
-    }
-    __thrust_exec_check_disable__
-    template <class K, class _0, class _1>
-    cudaError_t THRUST_FUNCTION
-    doit(K k, _0 x0, _1 x1) const
-    {
-      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1);
-    }
-    __thrust_exec_check_disable__
-    template <class K, class _0, class _1, class _2>
-    cudaError_t THRUST_FUNCTION
-    doit(K k, _0 x0, _1 x1, _2 x2) const
-    {
-      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2);
-    }
-    __thrust_exec_check_disable__
-    template <class K, class _0, class _1, class _2, class _3>
-    cudaError_t THRUST_FUNCTION
-    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3) const
-    {
-      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3);
-    }
-    __thrust_exec_check_disable__
-    template <class K, class _0, class _1, class _2, class _3, class _4>
-    cudaError_t THRUST_FUNCTION
-    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const
-    {
-      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4);
-    }
-    __thrust_exec_check_disable__
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5>
-    cudaError_t THRUST_FUNCTION
-    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const
-    {
-      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5);
-    }
-    __thrust_exec_check_disable__
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6>
-    cudaError_t THRUST_FUNCTION
-    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const
-    {
-      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6);
-    }
-    __thrust_exec_check_disable__
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
-    cudaError_t THRUST_FUNCTION
-    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const
-    {
-      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7);
-    }
-    __thrust_exec_check_disable__
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
-    cudaError_t THRUST_FUNCTION
-    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const
-    {
-      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8);
-    }
-    __thrust_exec_check_disable__
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
-    cudaError_t THRUST_FUNCTION
-    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const
-    {
-      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9);
-    }
-    __thrust_exec_check_disable__
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
-    cudaError_t THRUST_FUNCTION
-    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const
-    {
-      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA);
-    }
-    __thrust_exec_check_disable__
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
-    cudaError_t THRUST_FUNCTION
-    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const
-    {
-      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB);
-    }
-    __thrust_exec_check_disable__
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
-    cudaError_t THRUST_FUNCTION
-    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const
-    {
-      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC);
-    }
-    __thrust_exec_check_disable__
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
-    cudaError_t THRUST_FUNCTION
-    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD) const
-    {
-      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD);
-    }
-    __thrust_exec_check_disable__
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
-    cudaError_t THRUST_FUNCTION
-    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE) const
-    {
-      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE);
-    }
-    __thrust_exec_check_disable__
-    template <class K, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE, class _xF>
-    cudaError_t THRUST_FUNCTION
-    doit(K k, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE, _xF xF) const
-    {
-      return THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(k, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE, xF);
-    }
-#endif
-#undef THRUST_TRIPLE_LAUNCHER_HOSTDEVICE
+
   }; // struct triple_chevron
 
 }    // namespace launcher
 }    // namespace cuda_
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
diff --git a/thrust/system/cuda/detail/core/util.h b/thrust/system/cuda/detail/core/util.h
index a2c6b88cc..e2f5f8299 100644
--- a/thrust/system/cuda/detail/core/util.h
+++ b/thrust/system/cuda/detail/core/util.h
@@ -26,43 +26,57 @@
  ******************************************************************************/
 #pragma once
 
-#include <cuda_occupancy.h>
 #include <thrust/detail/config.h>
-#include <thrust/system/cuda/config.h>
-#include <thrust/type_traits/is_contiguous_iterator.h>
 #include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/system/cuda/config.h>
 #include <thrust/system/cuda/detail/util.h>
-#include <thrust/system/cuda/detail/cub/block/block_load.cuh>
-#include <thrust/system/cuda/detail/cub/block/block_store.cuh>
-#include <thrust/system/cuda/detail/cub/block/block_scan.cuh>
+#include <thrust/system/system_error.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
 
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_scan.cuh>
+#include <cub/block/block_store.cuh>
 
-THRUST_BEGIN_NS
+#include <nv/target>
+
+THRUST_NAMESPACE_BEGIN
 
 namespace cuda_cub {
 namespace core {
 
-#if (__CUDA_ARCH__ >= 600)
-#  define THRUST_TUNING_ARCH sm60
-#elif (__CUDA_ARCH__ >= 520)
-#  define THRUST_TUNING_ARCH sm52
-#elif (__CUDA_ARCH__ >= 350)
-#  define THRUST_TUNING_ARCH sm35
-#elif (__CUDA_ARCH__ >= 300)
-#  define THRUST_TUNING_ARCH sm30
-#elif !defined (__CUDA_ARCH__)
-#  define THRUST_TUNING_ARCH sm30
+#ifdef _NVHPC_CUDA
+#  if (__NVCOMPILER_CUDA_ARCH__ >= 600)
+#    define THRUST_TUNING_ARCH sm60
+#  elif (__NVCOMPILER_CUDA_ARCH__ >= 520)
+#    define THRUST_TUNING_ARCH sm52
+#  elif (__NVCOMPILER_CUDA_ARCH__ >= 350)
+#    define THRUST_TUNING_ARCH sm35
+#  else
+#    define THRUST_TUNING_ARCH sm30
+#  endif
+#else
+#  if (__CUDA_ARCH__ >= 600)
+#    define THRUST_TUNING_ARCH sm60
+#  elif (__CUDA_ARCH__ >= 520)
+#    define THRUST_TUNING_ARCH sm52
+#  elif (__CUDA_ARCH__ >= 350)
+#    define THRUST_TUNING_ARCH sm35
+#  elif (__CUDA_ARCH__ >= 300)
+#    define THRUST_TUNING_ARCH sm30
+#  elif !defined (__CUDA_ARCH__)
+#    define THRUST_TUNING_ARCH sm30
+#  endif
 #endif
 
   // Typelist - a container of types, supports up to 10 types
   // --------------------------------------------------------------------------
-  
+
   class _;
   template <class = _, class = _, class = _, class = _, class = _, class = _, class = _, class = _, class = _, class = _>
   struct typelist;
 
   // -------------------------------------
-  
+
   // supported SM arch
   // ---------------------
   struct sm30  { enum { ver = 300, warpSize = 32 }; };
@@ -94,7 +108,7 @@ namespace core {
 
   // metafunction to match next viable PtxPlan specialization
   // --------------------------------------------------------------------------
- 
+
   __THRUST_DEFINE_HAS_NESTED_TYPE(has_tuning_t, tuning)
   __THRUST_DEFINE_HAS_NESTED_TYPE(has_type_t, type)
 
@@ -121,7 +135,7 @@ namespace core {
             template <class, class> class Tuning,
             class _0>
   struct has_sm_tuning_impl<SM, Tuning<lowest_supported_sm_arch, _0> > : has_type_t<Tuning<SM, _0> > {};
-  
+
   // specializing for Tunig which needs 2 args
   template <class SM,
             template <class, class,class> class Tuning,
@@ -131,9 +145,9 @@ namespace core {
   template <template <class> class P, class SM>
   struct has_sm_tuning : has_sm_tuning_impl<SM, typename P<lowest_supported_sm_arch>::tuning > {};
 
-  // once first match is found in sm_list, all remaining sm are possible 
+  // once first match is found in sm_list, all remaining sm are possible
   // candidate for tuning, so pick the first available
-  //   if the plan P has SM-level tuning then pick it, 
+  //   if the plan P has SM-level tuning then pick it,
   //   otherwise move on to the next sm in the sm_list
   template <template <class> class P, class SM, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
   struct specialize_plan_impl_match<P, typelist<SM, _1, _2, _3, _4, _5, _6, _7, _8, _9> >
@@ -146,14 +160,14 @@ namespace core {
     struct specialize_plan_msvc10_war
     {
       // if Plan has tuning type, this means it has SM-specific tuning
-      // so loop through sm_list to find match, 
+      // so loop through sm_list to find match,
       // otherwise just specialize on provided SM
       typedef thrust::detail::conditional<has_tuning_t<Plan<lowest_supported_sm_arch> >::value,
                                   specialize_plan_impl_loop<Plan, SM, sm_list>,
                                   Plan<SM> >
           type;
     };
-    
+
     template <template <class> class Plan, class SM = THRUST_TUNING_ARCH>
     struct specialize_plan : specialize_plan_msvc10_war<Plan,SM>::type::type {};
 
@@ -339,17 +353,18 @@ namespace core {
     };
 
     template <class Agent>
-    typename get_plan<Agent>::type THRUST_RUNTIME_FUNCTION
-    get_agent_plan(int ptx_version)
+    THRUST_RUNTIME_FUNCTION
+    typename get_plan<Agent>::type get_agent_plan(int ptx_version)
     {
-#if (CUB_PTX_ARCH > 0) && defined(__THRUST_HAS_CUDART__)
-      typedef typename get_plan<Agent>::type Plan;
-      THRUST_UNUSED_VAR(ptx_version);
-      // We're on device, use default policy
-      return Plan(typename Agent::ptx_plan());
-#else
-      return get_agent_plan_impl<Agent, sm_list>::get(ptx_version);
-#endif
+      NV_IF_TARGET(
+        NV_IS_DEVICE,
+        (
+          THRUST_UNUSED_VAR(ptx_version);
+          using plan_type = typename get_plan<Agent>::type;
+          using ptx_plan  = typename Agent::ptx_plan;
+          return plan_type{ptx_plan{}};
+        ), // NV_IS_HOST:
+        ( return get_agent_plan_impl<Agent, sm_list>::get(ptx_version); ));
     }
 
 // XXX keep this dead-code for now as a gentle reminder
@@ -433,67 +448,12 @@ namespace core {
   /////////////////////////
   /////////////////////////
 
-  inline cudaError_t CUB_RUNTIME_FUNCTION
-  get_occ_device_properties(cudaOccDeviceProp &occ_prop, int dev_id)
-  {
-    cudaError_t status = cudaSuccess;
-#ifdef __CUDA_ARCH__
-    {
-      cudaOccDeviceProp &o = occ_prop;
-      //
-      status = cudaDeviceGetAttribute(&o.computeMajor,
-                                      cudaDevAttrComputeCapabilityMajor,
-                                      dev_id);
-      status = cudaDeviceGetAttribute(&o.computeMinor,
-                                      cudaDevAttrComputeCapabilityMinor,
-                                      dev_id);
-      status = cudaDeviceGetAttribute(&o.maxThreadsPerBlock,
-                                      cudaDevAttrMaxThreadsPerBlock,
-                                      dev_id);
-      status = cudaDeviceGetAttribute(&o.maxThreadsPerMultiprocessor,
-                                      cudaDevAttrMaxThreadsPerMultiProcessor,
-                                      dev_id);
-      status = cudaDeviceGetAttribute(&o.regsPerBlock,
-                                      cudaDevAttrMaxRegistersPerBlock,
-                                      dev_id);
-      status = cudaDeviceGetAttribute(&o.regsPerMultiprocessor,
-                                      cudaDevAttrMaxRegistersPerMultiprocessor,
-                                      dev_id);
-      status = cudaDeviceGetAttribute(&o.warpSize,
-                                      cudaDevAttrWarpSize,
-                                      dev_id);
-
-      int i32value;
-      status = cudaDeviceGetAttribute(&i32value,
-                                      cudaDevAttrMaxSharedMemoryPerBlock,
-                                      dev_id);
-      o.sharedMemPerBlock = static_cast<size_t>(i32value);
-
-      status = cudaDeviceGetAttribute(&i32value,
-                                      cudaDevAttrMaxSharedMemoryPerMultiprocessor,
-                                      dev_id);
-      o.sharedMemPerMultiprocessor = static_cast<size_t>(i32value);
-
-      status = cudaDeviceGetAttribute(&o.numSms,
-                                      cudaDevAttrMultiProcessorCount,
-                                      dev_id);
-    }
-#else
-    {
-      cudaDeviceProp props;
-      status   = cudaGetDeviceProperties(&props, dev_id);
-      occ_prop = cudaOccDeviceProp(props);
-    }
-#endif
-    return status;
-  }
-  
-  int CUB_RUNTIME_FUNCTION
-  inline get_sm_count()
+  THRUST_RUNTIME_FUNCTION
+  inline int get_sm_count()
   {
     int dev_id;
     cuda_cub::throw_on_error(cudaGetDevice(&dev_id),
-                             "get_sm_count:"
+                             "get_sm_count :"
                              "failed to cudaGetDevice");
 
     cudaError_t status;
@@ -507,8 +467,8 @@ namespace core {
     return i32value;
   }
 
-  size_t CUB_RUNTIME_FUNCTION
-  inline get_max_shared_memory_per_block()
+  THRUST_RUNTIME_FUNCTION
+  inline size_t get_max_shared_memory_per_block()
   {
     int dev_id;
     cuda_cub::throw_on_error(cudaGetDevice(&dev_id),
@@ -527,8 +487,8 @@ namespace core {
     return static_cast<size_t>(i32value);
   }
 
-  size_t CUB_RUNTIME_FUNCTION
-  inline virtual_shmem_size(size_t shmem_per_block)
+  THRUST_RUNTIME_FUNCTION
+  inline size_t virtual_shmem_size(size_t shmem_per_block)
   {
     size_t max_shmem_per_block = core::get_max_shared_memory_per_block();
     if (shmem_per_block > max_shmem_per_block)
@@ -536,9 +496,9 @@ namespace core {
     else
       return 0;
   }
-  
-  size_t CUB_RUNTIME_FUNCTION
-  inline vshmem_size(size_t shmem_per_block, size_t num_blocks)
+
+  THRUST_RUNTIME_FUNCTION
+  inline size_t vshmem_size(size_t shmem_per_block, size_t num_blocks)
   {
     size_t max_shmem_per_block = core::get_max_shared_memory_per_block();
     if (shmem_per_block > max_shmem_per_block)
@@ -547,51 +507,6 @@ namespace core {
       return 0;
   }
 
-  template <class Kernel>
-  int CUB_RUNTIME_FUNCTION 
-  get_max_block_size(Kernel k)
-  {
-    int devId;
-    cuda_cub::throw_on_error(cudaGetDevice(&devId),
-                   "get_max_block_size :"
-                   "failed to cudaGetDevice");
-
-    cudaOccDeviceProp occ_prop;
-    cuda_cub::throw_on_error(get_occ_device_properties(occ_prop, devId),
-                   "get_max_block_size: "
-                   "failed to cudaGetDeviceProperties");
-
-
-    cudaFuncAttributes attribs;
-    cuda_cub::throw_on_error(cudaFuncGetAttributes(&attribs, reinterpret_cast<void *>(k)),
-                   "get_max_block_size: "
-                   "failed to cudaFuncGetAttributes");
-    cudaOccFuncAttributes occ_attrib(attribs);
-
-
-    cudaFuncCache cacheConfig;
-    cuda_cub::throw_on_error(cudaDeviceGetCacheConfig(&cacheConfig),
-                   "get_max_block_size: "
-                   "failed to cudaDeviceGetCacheConfig");
-
-    cudaOccDeviceState occ_state;
-    occ_state.cacheConfig      = (cudaOccCacheConfig)cacheConfig;
-    int          block_size    = 0;
-    int          min_grid_size = 0;
-    cudaOccError occ_status    = cudaOccMaxPotentialOccupancyBlockSize(&min_grid_size,
-                                                                    &block_size,
-                                                                    &occ_prop,
-                                                                    &occ_attrib,
-                                                                    &occ_state,
-                                                                    0);
-    if (CUDA_OCC_SUCCESS != occ_status || block_size <= 0)
-      cuda_cub::throw_on_error(cudaErrorInvalidConfiguration,
-                     "get_max_block_size: "
-                     "failed to cudaOccMaxPotentialOccupancyBlockSize");
-
-    return block_size;
-  }
-  
   // LoadIterator
   // ------------
   // if trivial iterator is passed, wrap loads into LDG
@@ -607,7 +522,7 @@ namespace core {
         cub::CacheModifiedInputIterator<PtxPlan::LOAD_MODIFIER,
                                         value_type,
                                         size_type>,
-        It>::type type;
+                                        It>::type type;
   };    // struct Iterator
 
   template <class PtxPlan, class It>
@@ -616,7 +531,7 @@ namespace core {
   {
     return raw_pointer_cast(&*it);
   }
-  
+
   template <class PtxPlan, class It>
   typename LoadIterator<PtxPlan, It>::type __device__ __forceinline__
   make_load_iterator_impl(It it, thrust::detail::false_type /* is_trivial */)
@@ -646,18 +561,15 @@ namespace core {
             class T    = typename iterator_traits<It>::value_type>
   struct BlockLoad
   {
-    typedef cub::BlockLoad<T,
-                           PtxPlan::BLOCK_THREADS,
-                           PtxPlan::ITEMS_PER_THREAD,
-                           PtxPlan::LOAD_ALGORITHM,
-                           1,
-                           1,
-                           get_arch<PtxPlan>::type::ver>
-
-
-        type;
+    using type = cub::BlockLoad<T,
+                                PtxPlan::BLOCK_THREADS,
+                                PtxPlan::ITEMS_PER_THREAD,
+                                PtxPlan::LOAD_ALGORITHM,
+                                1,
+                                1,
+                                get_arch<PtxPlan>::type::ver>;
   };
-  
+
   // BlockStore
   // -----------
   // a helper metaprogram that returns type of a block loader
@@ -666,28 +578,27 @@ namespace core {
             class T = typename iterator_traits<It>::value_type>
   struct BlockStore
   {
-    typedef cub::BlockStore<T,
-                            PtxPlan::BLOCK_THREADS,
-                            PtxPlan::ITEMS_PER_THREAD,
-                            PtxPlan::STORE_ALGORITHM,
-                            1,
-                            1,
-                            get_arch<PtxPlan>::type::ver>
-        type;
+    using type = cub::BlockStore<T,
+                                 PtxPlan::BLOCK_THREADS,
+                                 PtxPlan::ITEMS_PER_THREAD,
+                                 PtxPlan::STORE_ALGORITHM,
+                                 1,
+                                 1,
+                                 get_arch<PtxPlan>::type::ver>;
   };
-  // cuda_otional
+
+  // cuda_optional
   // --------------
   // used for function that return cudaError_t along with the result
   //
   template <class T>
   class cuda_optional
   {
-    cudaError_t status_;
-    T           value_;
+    cudaError_t status_{cudaSuccess};
+    T           value_{};
 
   public:
-    __host__ __device__
-    cuda_optional() : status_(cudaSuccess) {}
+    cuda_optional() = default;
 
     __host__ __device__
     cuda_optional(T v, cudaError_t status = cudaSuccess) : status_(status), value_(v) {}
@@ -704,16 +615,62 @@ namespace core {
     __host__ __device__ operator T const &() const { return value_; }
   };
 
-  inline cuda_optional<int> CUB_RUNTIME_FUNCTION
-  get_ptx_version()
+  THRUST_RUNTIME_FUNCTION
+  inline int get_ptx_version()
   {
     int ptx_version = 0;
-    cudaError_t status = cub::PtxVersion(ptx_version);
-    return cuda_optional<int>(ptx_version, status);
+    if (cub::PtxVersion(ptx_version) != cudaSuccess) 
+    {
+      // Failure might mean that there's no device found
+      const int current_device = cub::CurrentDevice();
+      if (current_device < 0)
+      {
+        cuda_cub::throw_on_error(cudaErrorNoDevice, "No GPU is available\n");
+      }
+
+      // Any subsequent failure means the provided device binary does not match 
+      // the generated function code
+      int major = 0, minor = 0;
+      cudaError_t attr_status;
+
+      attr_status = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device);
+      cuda_cub::throw_on_error(attr_status,
+                              "get_ptx_version :"
+                              "failed to get major CUDA device compute capability version.");
+
+      attr_status = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device);
+      cuda_cub::throw_on_error(attr_status,
+                              "get_ptx_version :"
+                              "failed to get minor CUDA device compute capability version.");
+        
+      // Index from which SM code has to start in the message below
+      int code_offset = 37;
+      char str[] = "This program was not compiled for SM     \n";
+
+      auto print_1_helper = [&](int v) {
+        str[code_offset] = static_cast<char>(v) + '0';
+        code_offset++;
+      };
+
+      // Assume two digits will be enough
+      auto print_2_helper = [&](int v) {
+        if (v / 10 != 0) {
+          print_1_helper(v / 10);
+        }
+        print_1_helper(v % 10);
+      };
+
+      print_2_helper(major);
+      print_2_helper(minor);
+
+      cuda_cub::throw_on_error(cudaErrorInvalidDevice, str);
+    }
+
+    return ptx_version;
   }
 
-  inline cudaError_t CUB_RUNTIME_FUNCTION
-  sync_stream(cudaStream_t stream)
+  THRUST_RUNTIME_FUNCTION
+  inline cudaError_t sync_stream(cudaStream_t stream)
   {
     return cub::SyncStream(stream);
   }
@@ -724,7 +681,10 @@ namespace core {
   }
 
 #define CUDA_CUB_RET_IF_FAIL(e) \
-  if (thrust::cuda_cub::cub::Debug((e), __FILE__, __LINE__)) return e;
+  {                             \
+    auto const error = (e);     \
+    if (cub::Debug(error, __FILE__, __LINE__)) return error; \
+  }
 
   // uninitialized
   // -------
@@ -749,7 +709,7 @@ namespace core {
 
     __host__ __device__ __forceinline__ operator T&() { return get(); }
   };
-  
+
   // uninitialized_array
   // --------------
   // allocates uninitialized data on stack
@@ -837,6 +797,6 @@ using core::sm60;
 using core::sm52;
 using core::sm35;
 using core::sm30;
-} // namespace cuda_ 
+} // namespace cuda_
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
diff --git a/thrust/system/cuda/detail/count.h b/thrust/system/cuda/detail/count.h
index 2ed68d7e7..b624f39dc 100644
--- a/thrust/system/cuda/detail/count.h
+++ b/thrust/system/cuda/detail/count.h
@@ -26,6 +26,7 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <thrust/system/cuda/config.h>
@@ -34,7 +35,7 @@
 #include <thrust/system/cuda/detail/reduce.h>
 #include <thrust/distance.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 template <class Derived,
@@ -75,5 +76,5 @@ count(execution_policy<Derived> &policy,
 }
 
 } // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/cross_system.h b/thrust/system/cuda/detail/cross_system.h
index 56a20daa2..039531d28 100644
--- a/thrust/system/cuda/detail/cross_system.h
+++ b/thrust/system/cuda/detail/cross_system.h
@@ -26,11 +26,13 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
 #include <thrust/system/cpp/detail/execution_policy.h>
 #include <thrust/system/cuda/detail/execution_policy.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
   template <class Sys1, class Sys2>
@@ -55,7 +57,7 @@ namespace cuda_cub {
 #if THRUST_CPP_DIALECT >= 2011
   // Device to host.
   template <class Sys1, class Sys2>
-  THRUST_CONSTEXPR __host__ __device__ 
+  constexpr __host__ __device__
   auto direction_of_copy(
     thrust::system::cuda::execution_policy<Sys1> const&
   , thrust::cpp::execution_policy<Sys2> const&
@@ -68,7 +70,7 @@ namespace cuda_cub {
 
   // Host to device.
   template <class Sys1, class Sys2>
-  THRUST_CONSTEXPR __host__ __device__
+  constexpr __host__ __device__
   auto direction_of_copy(
     thrust::cpp::execution_policy<Sys1> const&
   , thrust::system::cuda::execution_policy<Sys2> const&
@@ -81,7 +83,7 @@ namespace cuda_cub {
 
   // Device to device.
   template <class Sys1, class Sys2>
-  THRUST_CONSTEXPR __host__ __device__
+  constexpr __host__ __device__
   auto direction_of_copy(
     thrust::system::cuda::execution_policy<Sys1> const&
   , thrust::system::cuda::execution_policy<Sys2> const&
@@ -94,7 +96,7 @@ namespace cuda_cub {
 
   // Device to device.
   template <class DerivedPolicy>
-  THRUST_CONSTEXPR __host__ __device__ 
+  constexpr __host__ __device__
   auto direction_of_copy(execution_policy<DerivedPolicy> const &)
   THRUST_DECLTYPE_RETURNS(
     thrust::detail::integral_constant<
@@ -103,7 +105,7 @@ namespace cuda_cub {
   )
 
   template <class Sys1, class Sys2>
-  THRUST_CONSTEXPR __host__ __device__
+  constexpr __host__ __device__
   auto direction_of_copy(
     execution_policy<cross_system<Sys1, Sys2>> const &systems
   )
@@ -114,89 +116,95 @@ namespace cuda_cub {
     )
   )
 
-  template <typename ExecutionPolicy0, typename ExecutionPolicy1>
-  THRUST_CONSTEXPR __host__ __device__
-  auto is_device_to_host_copy(
+  template <typename ExecutionPolicy0,
+            typename ExecutionPolicy1,
+            // MSVC2015 WAR: put decltype here instead of in trailing return type
+            typename Direction =
+              decltype(direction_of_copy(std::declval<ExecutionPolicy0>(),
+                                         std::declval<ExecutionPolicy1>()))>
+  constexpr __host__ __device__
+  thrust::detail::integral_constant<
+    bool, cudaMemcpyDeviceToHost == Direction::value
+  >
+  is_device_to_host_copy(
     ExecutionPolicy0 const& exec0
   , ExecutionPolicy1 const& exec1
-  )
-    noexcept -> 
-      thrust::detail::integral_constant<
-        bool
-      ,    cudaMemcpyDeviceToHost
-        == decltype(direction_of_copy(exec0, exec1))::value
-      >
+  ) noexcept
   {
     return {};
   }
 
-  template <typename ExecutionPolicy>
-  THRUST_CONSTEXPR __host__ __device__
-  auto is_device_to_host_copy(ExecutionPolicy const& exec)
-    noexcept -> 
-      thrust::detail::integral_constant<
-        bool
-      ,    cudaMemcpyDeviceToHost
-        == decltype(direction_of_copy(exec))::value
-      >
+  template <typename ExecutionPolicy,
+            // MSVC2015 WAR: put decltype here instead of in trailing return type
+            typename Direction =
+              decltype(direction_of_copy(std::declval<ExecutionPolicy>()))>
+  constexpr __host__ __device__
+  thrust::detail::integral_constant<
+    bool, cudaMemcpyDeviceToHost == Direction::value
+  >
+  is_device_to_host_copy(ExecutionPolicy const& exec) noexcept
   {
     return {};
   }
 
-  template <typename ExecutionPolicy0, typename ExecutionPolicy1>
-  THRUST_CONSTEXPR __host__ __device__
-  auto is_host_to_device_copy(
+  template <typename ExecutionPolicy0,
+            typename ExecutionPolicy1,
+            // MSVC2015 WAR: put decltype here instead of in trailing return type
+            typename Direction =
+              decltype(direction_of_copy(std::declval<ExecutionPolicy0>(),
+                                         std::declval<ExecutionPolicy1>()))>
+  constexpr __host__ __device__
+  thrust::detail::integral_constant<
+    bool, cudaMemcpyHostToDevice == Direction::value
+  >
+  is_host_to_device_copy(
     ExecutionPolicy0 const& exec0
   , ExecutionPolicy1 const& exec1
-  )
-    noexcept -> 
-      thrust::detail::integral_constant<
-        bool
-      ,    cudaMemcpyHostToDevice
-        == decltype(direction_of_copy(exec0, exec1))::value
-      >
+  ) noexcept
   {
     return {};
   }
 
-  template <typename ExecutionPolicy>
-  THRUST_CONSTEXPR __host__ __device__
-  auto is_host_to_device_copy(ExecutionPolicy const& exec)
-    noexcept -> 
-      thrust::detail::integral_constant<
-        bool
-      ,    cudaMemcpyHostToDevice
-        == decltype(direction_of_copy(exec))::value
-      >
+  template <typename ExecutionPolicy,
+            // MSVC2015 WAR: put decltype here instead of in trailing return type
+            typename Direction =
+              decltype(direction_of_copy(std::declval<ExecutionPolicy>()))>
+  constexpr __host__ __device__
+  thrust::detail::integral_constant<
+    bool, cudaMemcpyHostToDevice == Direction::value
+  >
+  is_host_to_device_copy(ExecutionPolicy const& exec) noexcept
   {
     return {};
   }
 
-  template <typename ExecutionPolicy0, typename ExecutionPolicy1>
-  THRUST_CONSTEXPR __host__ __device__
-  auto is_device_to_device_copy(
+  template <typename ExecutionPolicy0,
+            typename ExecutionPolicy1,
+            // MSVC2015 WAR: put decltype here instead of in trailing return type
+            typename Direction =
+              decltype(direction_of_copy(std::declval<ExecutionPolicy0>(),
+                                         std::declval<ExecutionPolicy1>()))>
+  constexpr __host__ __device__
+  thrust::detail::integral_constant<
+    bool, cudaMemcpyDeviceToDevice == Direction::value
+  >
+  is_device_to_device_copy(
     ExecutionPolicy0 const& exec0
   , ExecutionPolicy1 const& exec1
-  )
-    noexcept -> 
-      thrust::detail::integral_constant<
-        bool
-      ,    cudaMemcpyDeviceToDevice
-        == decltype(direction_of_copy(exec0, exec1))::value
-      >
+  ) noexcept
   {
     return {};
   }
 
-  template <typename ExecutionPolicy>
-  THRUST_CONSTEXPR __host__ __device__
-  auto is_device_to_device_copy(ExecutionPolicy const& exec)
-    noexcept -> 
-      thrust::detail::integral_constant<
-        bool
-      ,    cudaMemcpyDeviceToDevice
-        == decltype(direction_of_copy(exec))::value
-      >
+  template <typename ExecutionPolicy,
+            // MSVC2015 WAR: put decltype here instead of in trailing return type
+            typename Direction =
+              decltype(direction_of_copy(std::declval<ExecutionPolicy>()))>
+  constexpr __host__ __device__
+  thrust::detail::integral_constant<
+    bool, cudaMemcpyDeviceToDevice == Direction::value
+  >
+  is_device_to_device_copy(ExecutionPolicy const& exec) noexcept
   {
     return {};
   }
@@ -327,5 +335,5 @@ namespace cuda_cub {
   }
 
 } // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh b/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh
deleted file mode 100644
index 0833ed31b..000000000
--- a/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh
+++ /dev/null
@@ -1,787 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram .
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "../util_type.cuh"
-#include "../block/block_load.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy
- ******************************************************************************/
-
-/**
- *
- */
-enum BlockHistogramMemoryPreference
-{
-    GMEM,
-    SMEM,
-    BLEND
-};
-
-
-/**
- * Parameterizable tuning policy type for AgentHistogram
- */
-template <
-    int                             _BLOCK_THREADS,                 ///< Threads per thread block
-    int                             _PIXELS_PER_THREAD,             ///< Pixels per thread (per tile of input)
-    BlockLoadAlgorithm              _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
-    CacheLoadModifier               _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
-    bool                            _RLE_COMPRESS,                  ///< Whether to perform localized RLE to compress samples before histogramming
-    BlockHistogramMemoryPreference  _MEM_PREFERENCE,                ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
-    bool                            _WORK_STEALING>                 ///< Whether to dequeue tiles from a global work queue
-struct AgentHistogramPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,                   ///< Threads per thread block
-        PIXELS_PER_THREAD       = _PIXELS_PER_THREAD,               ///< Pixels per thread (per tile of input)
-        IS_RLE_COMPRESS         = _RLE_COMPRESS,                    ///< Whether to perform localized RLE to compress samples before histogramming
-        MEM_PREFERENCE          = _MEM_PREFERENCE,                  ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
-        IS_WORK_STEALING        = _WORK_STEALING,                   ///< Whether to dequeue tiles from a global work queue
-    };
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;          ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;           ///< Cache load modifier for reading input elements
-};
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram .
- */
-template <
-    typename    AgentHistogramPolicyT,     ///< Parameterized AgentHistogramPolicy tuning policy type
-    int         PRIVATIZED_SMEM_BINS,           ///< Number of privatized shared-memory histogram bins of any channel.  Zero indicates privatized counters to be maintained in device-accessible memory.
-    int         NUM_CHANNELS,                   ///< Number of channels interleaved in the input data.  Supports up to four channels.
-    int         NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
-    typename    SampleIteratorT,                ///< Random-access input iterator type for reading samples
-    typename    CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
-    typename    PrivatizedDecodeOpT,            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
-    typename    OutputDecodeOpT,                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
-    typename    OffsetT,                        ///< Signed integer type for global offsets
-    int         PTX_ARCH = CUB_PTX_ARCH>        ///< PTX compute capability
-struct AgentHistogram
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// The sample type of the input iterator
-    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-
-    /// The pixel type of SampleT
-    typedef typename CubVector<SampleT, NUM_CHANNELS>::Type PixelT;
-
-    /// The quad type of SampleT
-    typedef typename CubVector<SampleT, 4>::Type QuadT;
-
-    /// Constants
-    enum
-    {
-        BLOCK_THREADS           = AgentHistogramPolicyT::BLOCK_THREADS,
-
-        PIXELS_PER_THREAD       = AgentHistogramPolicyT::PIXELS_PER_THREAD,
-        SAMPLES_PER_THREAD      = PIXELS_PER_THREAD * NUM_CHANNELS,
-        QUADS_PER_THREAD        = SAMPLES_PER_THREAD / 4,
-
-        TILE_PIXELS             = PIXELS_PER_THREAD * BLOCK_THREADS,
-        TILE_SAMPLES            = SAMPLES_PER_THREAD * BLOCK_THREADS,
-
-        IS_RLE_COMPRESS            = AgentHistogramPolicyT::IS_RLE_COMPRESS,
-
-        MEM_PREFERENCE          = (PRIVATIZED_SMEM_BINS > 0) ?
-                                        AgentHistogramPolicyT::MEM_PREFERENCE :
-                                        GMEM,
-
-        IS_WORK_STEALING           = AgentHistogramPolicyT::IS_WORK_STEALING,
-    };
-
-    /// Cache load modifier for reading input elements
-    static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER;
-
-
-    /// Input iterator wrapper type (for applying cache modifier)
-    typedef typename If<IsPointer<SampleIteratorT>::VALUE,
-            CacheModifiedInputIterator<LOAD_MODIFIER, SampleT, OffsetT>,     // Wrap the native input pointer with CacheModifiedInputIterator
-            SampleIteratorT>::Type                                           // Directly use the supplied input iterator type
-        WrappedSampleIteratorT;
-
-    /// Pixel input iterator type (for applying cache modifier)
-    typedef CacheModifiedInputIterator<LOAD_MODIFIER, PixelT, OffsetT>
-        WrappedPixelIteratorT;
-
-    /// Qaud input iterator type (for applying cache modifier)
-    typedef CacheModifiedInputIterator<LOAD_MODIFIER, QuadT, OffsetT>
-        WrappedQuadIteratorT;
-
-    /// Parameterized BlockLoad type for samples
-    typedef BlockLoad<
-            SampleT,
-            BLOCK_THREADS,
-            SAMPLES_PER_THREAD,
-            AgentHistogramPolicyT::LOAD_ALGORITHM>
-        BlockLoadSampleT;
-
-    /// Parameterized BlockLoad type for pixels
-    typedef BlockLoad<
-            PixelT,
-            BLOCK_THREADS,
-            PIXELS_PER_THREAD,
-            AgentHistogramPolicyT::LOAD_ALGORITHM>
-        BlockLoadPixelT;
-
-    /// Parameterized BlockLoad type for quads
-    typedef BlockLoad<
-            QuadT,
-            BLOCK_THREADS,
-            QUADS_PER_THREAD,
-            AgentHistogramPolicyT::LOAD_ALGORITHM>
-        BlockLoadQuadT;
-
-    /// Shared memory type required by this thread block
-    struct _TempStorage
-    {
-        CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1];     // Smem needed for block-privatized smem histogram (with 1 word of padding)
-
-        int tile_idx;
-
-        // Aliasable storage layout
-        union Aliasable
-        {
-            typename BlockLoadSampleT::TempStorage sample_load;     // Smem needed for loading a tile of samples
-            typename BlockLoadPixelT::TempStorage pixel_load;       // Smem needed for loading a tile of pixels
-            typename BlockLoadQuadT::TempStorage quad_load;         // Smem needed for loading a tile of quads
-
-        } aliasable;
-    };
-
-
-    /// Temporary storage type (unionable)
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    /// Reference to temp_storage
-    _TempStorage &temp_storage;
-
-    /// Sample input iterator (with cache modifier applied, if possible)
-    WrappedSampleIteratorT d_wrapped_samples;
-
-    /// Native pointer for input samples (possibly NULL if unavailable)
-    SampleT* d_native_samples;
-
-    /// The number of output bins for each channel
-    int (&num_output_bins)[NUM_ACTIVE_CHANNELS];
-
-    /// The number of privatized bins for each channel
-    int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS];
-
-    /// Reference to gmem privatized histograms for each channel
-    CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS];
-
-    /// Reference to final output histograms (gmem)
-    CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS];
-
-    /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel
-    OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS];
-
-    /// The transform operator for determining privatized counter indices from samples, one for each channel
-    PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS];
-
-    /// Whether to prefer privatized smem counters vs privatized global counters
-    bool prefer_smem;
-
-
-    //---------------------------------------------------------------------
-    // Initialize privatized bin counters
-    //---------------------------------------------------------------------
-
-    // Initialize privatized bin counters
-    __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
-    {
-        // Initialize histogram bin counts to zeros
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-        {
-            for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS)
-            {
-                privatized_histograms[CHANNEL][privatized_bin] = 0;
-            }
-        }
-
-        // Barrier to make sure all threads are done updating counters
-        CTA_SYNC();
-    }
-
-
-    // Initialize privatized bin counters.  Specialized for privatized shared-memory counters
-    __device__ __forceinline__ void InitSmemBinCounters()
-    {
-        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
-
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
-
-        InitBinCounters(privatized_histograms);
-    }
-
-
-    // Initialize privatized bin counters.  Specialized for privatized global-memory counters
-    __device__ __forceinline__ void InitGmemBinCounters()
-    {
-        InitBinCounters(d_privatized_histograms);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Update final output histograms
-    //---------------------------------------------------------------------
-
-    // Update final output histograms from privatized histograms
-    __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
-    {
-        // Barrier to make sure all threads are done updating counters
-        CTA_SYNC();
-
-        // Apply privatized bin counts to output bin counts
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-        {
-            int channel_bins = num_privatized_bins[CHANNEL];
-            for (int privatized_bin = threadIdx.x; 
-                    privatized_bin < channel_bins;  
-                    privatized_bin += BLOCK_THREADS)
-            {
-                int         output_bin  = -1;
-                CounterT    count       = privatized_histograms[CHANNEL][privatized_bin];
-                bool        is_valid    = count > 0;
-
-                output_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>((SampleT) privatized_bin, output_bin, is_valid);
-
-                if (output_bin >= 0)
-                {
-                    atomicAdd(&d_output_histograms[CHANNEL][output_bin], count);
-                }
-
-            }
-        }
-    }
-
-
-    // Update final output histograms from privatized histograms.  Specialized for privatized shared-memory counters
-    __device__ __forceinline__ void StoreSmemOutput()
-    {
-        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
-
-        StoreOutput(privatized_histograms);
-    }
-
-
-    // Update final output histograms from privatized histograms.  Specialized for privatized global-memory counters
-    __device__ __forceinline__ void StoreGmemOutput()
-    {
-        StoreOutput(d_privatized_histograms);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Tile accumulation
-    //---------------------------------------------------------------------
-
-    // Accumulate pixels.  Specialized for RLE compression.
-    __device__ __forceinline__ void AccumulatePixels(
-        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
-        bool                is_valid[PIXELS_PER_THREAD],
-        CounterT*           privatized_histograms[NUM_ACTIVE_CHANNELS],
-        Int2Type<true>      is_rle_compress)
-    {
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-        {
-            // Bin pixels
-            int bins[PIXELS_PER_THREAD];
-
-            #pragma unroll
-            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
-            {
-                bins[PIXEL] = -1;
-                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]);
-            }
-
-            CounterT accumulator = 1;
-
-            #pragma unroll
-            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL)
-            {
-                if (bins[PIXEL] != bins[PIXEL + 1])
-                {
-                    if (bins[PIXEL] >= 0)
-                        atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator);
-
-                     accumulator = 0;
-                }
-                accumulator++;
-            }
-
-            // Last pixel
-            if (bins[PIXELS_PER_THREAD - 1] >= 0)
-                atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator);
-        }
-    }
-
-
-    // Accumulate pixels.  Specialized for individual accumulation of each pixel.
-    __device__ __forceinline__ void AccumulatePixels(
-        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
-        bool                is_valid[PIXELS_PER_THREAD],
-        CounterT*           privatized_histograms[NUM_ACTIVE_CHANNELS],
-        Int2Type<false>     is_rle_compress)
-    {
-        #pragma unroll
-        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
-        {
-            #pragma unroll
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-            {
-                int bin = -1;
-                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]);
-                if (bin >= 0)
-                    atomicAdd(privatized_histograms[CHANNEL] + bin, 1);
-            }
-        }
-    }
-
-
-    /**
-     * Accumulate pixel, specialized for smem privatized histogram
-     */
-    __device__ __forceinline__ void AccumulateSmemPixels(
-        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
-        bool                is_valid[PIXELS_PER_THREAD])
-    {
-        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
-
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
-
-        AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
-    }
-
-
-    /**
-     * Accumulate pixel, specialized for gmem privatized histogram
-     */
-    __device__ __forceinline__ void AccumulateGmemPixels(
-        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
-        bool                is_valid[PIXELS_PER_THREAD])
-    {
-        AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
-    }
-
-
-
-    //---------------------------------------------------------------------
-    // Tile loading
-    //---------------------------------------------------------------------
-
-    // Load full, aligned tile using pixel iterator (multi-channel)
-    template <int _NUM_ACTIVE_CHANNELS>
-    __device__ __forceinline__ void LoadFullAlignedTile(
-        OffsetT                         block_offset,
-        int                             valid_samples,
-        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-        Int2Type<_NUM_ACTIVE_CHANNELS>  num_active_channels)
-    {
-        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];
-
-        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
-
-        // Load using a wrapped pixel iterator
-        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
-            d_wrapped_pixels,
-            reinterpret_cast<AliasedPixels&>(samples));
-    }
-
-    // Load full, aligned tile using quad iterator (single-channel)
-    __device__ __forceinline__ void LoadFullAlignedTile(
-        OffsetT                         block_offset,
-        int                             valid_samples,
-        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-        Int2Type<1>                     num_active_channels)
-    {
-        typedef QuadT AliasedQuads[QUADS_PER_THREAD];
-
-        WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset));
-
-        // Load using a wrapped quad iterator
-        BlockLoadQuadT(temp_storage.aliasable.quad_load).Load(
-            d_wrapped_quads,
-            reinterpret_cast<AliasedQuads&>(samples));
-    }
-
-    // Load full, aligned tile
-    __device__ __forceinline__ void LoadTile(
-        OffsetT         block_offset,
-        int             valid_samples,
-        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-        Int2Type<true>  is_full_tile,
-        Int2Type<true>  is_aligned)
-    {
-        LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type<NUM_ACTIVE_CHANNELS>());
-    }
-
-    // Load full, mis-aligned tile using sample iterator
-    __device__ __forceinline__ void LoadTile(
-        OffsetT         block_offset,
-        int             valid_samples,
-        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-        Int2Type<true>  is_full_tile,
-        Int2Type<false> is_aligned)
-    {
-        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];
-
-        // Load using sample iterator
-        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
-            d_wrapped_samples + block_offset,
-            reinterpret_cast<AliasedSamples&>(samples));
-    }
-
-    // Load partially-full, aligned tile using the pixel iterator
-    __device__ __forceinline__ void LoadTile(
-        OffsetT         block_offset,
-        int             valid_samples,
-        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-        Int2Type<false> is_full_tile,
-        Int2Type<true>  is_aligned)
-    {
-        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];
-
-        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
-
-        int valid_pixels = valid_samples / NUM_CHANNELS;
-
-        // Load using a wrapped pixel iterator
-        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
-            d_wrapped_pixels,
-            reinterpret_cast<AliasedPixels&>(samples),
-            valid_pixels);
-    }
-
-    // Load partially-full, mis-aligned tile using sample iterator
-    __device__ __forceinline__ void LoadTile(
-        OffsetT         block_offset,
-        int             valid_samples,
-        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
-        Int2Type<false> is_full_tile,
-        Int2Type<false> is_aligned)
-    {
-        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];
-
-        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
-            d_wrapped_samples + block_offset,
-            reinterpret_cast<AliasedSamples&>(samples),
-            valid_samples);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Tile processing
-    //---------------------------------------------------------------------
-
-    // Consume a tile of data samples
-    template <
-        bool IS_ALIGNED,        // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel)
-        bool IS_FULL_TILE>      // Whether the tile is full
-    __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples)
-    {
-        SampleT     samples[PIXELS_PER_THREAD][NUM_CHANNELS];
-        bool        is_valid[PIXELS_PER_THREAD];
-
-        // Load tile
-        LoadTile(
-            block_offset,
-            valid_samples,
-            samples,
-            Int2Type<IS_FULL_TILE>(),
-            Int2Type<IS_ALIGNED>());
-
-        // Set valid flags
-        #pragma unroll
-        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
-            is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);
-
-        // Accumulate samples
-#if CUB_PTX_ARCH >= 120
-        if (prefer_smem)
-            AccumulateSmemPixels(samples, is_valid);
-        else
-            AccumulateGmemPixels(samples, is_valid);
-#else
-        AccumulateGmemPixels(samples, is_valid);
-#endif
-
-    }
-
-
-    // Consume row tiles.  Specialized for work-stealing from queue
-    template <bool IS_ALIGNED>
-    __device__ __forceinline__ void ConsumeTiles(
-        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
-        OffsetT             num_rows,                   ///< The number of rows in the region of interest
-        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
-        int                 tiles_per_row,              ///< Number of image tiles per row
-        GridQueue<int>      tile_queue,
-        Int2Type<true>      is_work_stealing)
-    {
-
-        int         num_tiles                   = num_rows * tiles_per_row;
-        int         tile_idx                    = (blockIdx.y  * gridDim.x) + blockIdx.x;
-        OffsetT     num_even_share_tiles        = gridDim.x * gridDim.y;
-
-        while (tile_idx < num_tiles)
-        {
-            int     row             = tile_idx / tiles_per_row;
-            int     col             = tile_idx - (row * tiles_per_row);
-            OffsetT row_offset      = row * row_stride_samples;
-            OffsetT col_offset      = (col * TILE_SAMPLES);
-            OffsetT tile_offset     = row_offset + col_offset;
-
-            if (col == tiles_per_row - 1)
-            {
-                // Consume a partially-full tile at the end of the row
-                OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset;
-                ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
-            } 
-            else
-            {
-                // Consume full tile
-                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
-            }
-
-            CTA_SYNC();
-
-            // Get next tile
-            if (threadIdx.x == 0)
-                temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles;
-
-            CTA_SYNC();
-
-            tile_idx = temp_storage.tile_idx;
-        }
-    }
-
-
-    // Consume row tiles.  Specialized for even-share (striped across thread blocks)
-    template <bool IS_ALIGNED>
-    __device__ __forceinline__ void ConsumeTiles(
-        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
-        OffsetT             num_rows,                   ///< The number of rows in the region of interest
-        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
-        int                 tiles_per_row,              ///< Number of image tiles per row
-        GridQueue<int>      tile_queue,
-        Int2Type<false>     is_work_stealing)
-    {
-        for (int row = blockIdx.y; row < num_rows; row += gridDim.y)
-        {
-            OffsetT row_begin   = row * row_stride_samples;
-            OffsetT row_end     = row_begin + (num_row_pixels * NUM_CHANNELS);
-            OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES);
-
-            while (tile_offset < row_end)
-            {
-                OffsetT num_remaining = row_end - tile_offset;
-
-                if (num_remaining < TILE_SAMPLES)
-                {
-                    // Consume partial tile
-                    ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
-                    break;
-                }
-
-                // Consume full tile
-                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
-                tile_offset += gridDim.x * TILE_SAMPLES;
-            }
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Parameter extraction
-    //---------------------------------------------------------------------
-
-    // Return a native pixel pointer (specialized for CacheModifiedInputIterator types)
-    template <
-        CacheLoadModifier   _MODIFIER,
-        typename            _ValueT,
-        typename            _OffsetT>
-    __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr)
-    {
-        return itr.ptr;
-    }
-
-    // Return a native pixel pointer (specialized for other types)
-    template <typename IteratorT>
-    __device__ __forceinline__ SampleT* NativePointer(IteratorT itr)
-    {
-        return NULL;
-    }
-
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ AgentHistogram(
-        TempStorage         &temp_storage,                                      ///< Reference to temp_storage
-        SampleIteratorT     d_samples,                                          ///< Input data to reduce
-        int                 (&num_output_bins)[NUM_ACTIVE_CHANNELS],            ///< The number bins per final output histogram
-        int                 (&num_privatized_bins)[NUM_ACTIVE_CHANNELS],        ///< The number bins per privatized histogram
-        CounterT*           (&d_output_histograms)[NUM_ACTIVE_CHANNELS],        ///< Reference to final output histograms
-        CounterT*           (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS],    ///< Reference to privatized histograms
-        OutputDecodeOpT     (&output_decode_op)[NUM_ACTIVE_CHANNELS],           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
-        PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS])       ///< The transform operator for determining privatized counter indices from samples, one for each channel
-    :
-        temp_storage(temp_storage.Alias()),
-        d_wrapped_samples(d_samples),
-        num_output_bins(num_output_bins),
-        num_privatized_bins(num_privatized_bins),
-        d_output_histograms(d_output_histograms),
-        privatized_decode_op(privatized_decode_op),
-        output_decode_op(output_decode_op),
-        d_native_samples(NativePointer(d_wrapped_samples)),
-        prefer_smem((MEM_PREFERENCE == SMEM) ?
-            true :                              // prefer smem privatized histograms
-            (MEM_PREFERENCE == GMEM) ?
-                false :                         // prefer gmem privatized histograms
-                blockIdx.x & 1)                 // prefer blended privatized histograms
-    {
-        int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
-
-        // Initialize the locations of this block's privatized histograms
-        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-            this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]);
-    }
-
-
-    /**
-     * Consume image
-     */
-    __device__ __forceinline__ void ConsumeTiles(
-        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
-        OffsetT             num_rows,                   ///< The number of rows in the region of interest
-        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
-        int                 tiles_per_row,              ///< Number of image tiles per row
-        GridQueue<int>      tile_queue)                 ///< Queue descriptor for assigning tiles of work to thread blocks
-    {
-        // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel)
-        int     quad_mask           = AlignBytes<QuadT>::ALIGN_BYTES - 1;
-        int     pixel_mask          = AlignBytes<PixelT>::ALIGN_BYTES - 1;
-        size_t  row_bytes           = sizeof(SampleT) * row_stride_samples;
-
-        bool quad_aligned_rows      = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) &&     // Single channel
-                                        ((size_t(d_native_samples) & quad_mask) == 0) &&        // ptr is quad-aligned
-                                        ((num_rows == 1) || ((row_bytes & quad_mask) == 0));    // number of row-samples is a multiple of the alignment of the quad
-
-        bool pixel_aligned_rows     = (NUM_CHANNELS > 1) &&                                     // Multi channel
-                                        ((size_t(d_native_samples) & pixel_mask) == 0) &&       // ptr is pixel-aligned
-                                        ((row_bytes & pixel_mask) == 0);                        // number of row-samples is a multiple of the alignment of the pixel
-
-        // Whether rows are aligned and can be vectorized
-        if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows))
-            ConsumeTiles<true>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
-        else
-            ConsumeTiles<false>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
-    }
-
-
-    /**
-     * Initialize privatized bin counters.  Specialized for privatized shared-memory counters
-     */
-    __device__ __forceinline__ void InitBinCounters()
-    {
-        if (prefer_smem)
-            InitSmemBinCounters();
-        else
-            InitGmemBinCounters();
-    }
-
-
-    /**
-     * Store privatized histogram to device-accessible memory.  Specialized for privatized shared-memory counters
-     */
-    __device__ __forceinline__ void StoreOutput()
-    {
-        if (prefer_smem)
-            StoreSmemOutput();
-        else
-            StoreGmemOutput();
-    }
-
-
-};
-
-
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh b/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh
deleted file mode 100644
index 1b1fd8a3e..000000000
--- a/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh
+++ /dev/null
@@ -1,789 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep .
- */
-
-
-#pragma once
-
-#include <stdint.h>
-
-#include "../thread/thread_load.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_radix_rank.cuh"
-#include "../block/block_exchange.cuh"
-#include "../util_type.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Radix ranking algorithm
- */
-enum RadixRankAlgorithm
-{
-    RADIX_RANK_BASIC,
-    RADIX_RANK_MEMOIZE,
-    RADIX_RANK_MATCH
-};
-
-/**
- * Parameterizable tuning policy type for AgentRadixSortDownsweep
- */
-template <
-    int                         _BLOCK_THREADS,         ///< Threads per thread block
-    int                         _ITEMS_PER_THREAD,      ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm          _LOAD_ALGORITHM,        ///< The BlockLoad algorithm to use
-    CacheLoadModifier           _LOAD_MODIFIER,         ///< Cache load modifier for reading keys (and values)
-    RadixRankAlgorithm          _RANK_ALGORITHM,        ///< The radix ranking algorithm to use
-    BlockScanAlgorithm          _SCAN_ALGORITHM,        ///< The block scan algorithm to use
-    int                         _RADIX_BITS>            ///< The number of radix bits, i.e., log2(bins)
-struct AgentRadixSortDownsweepPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,           ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,        ///< Items per thread (per tile of input)
-        RADIX_BITS              = _RADIX_BITS,              ///< The number of radix bits, i.e., log2(bins)
-    };
-
-    static const BlockLoadAlgorithm  LOAD_ALGORITHM     = _LOAD_ALGORITHM;    ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier   LOAD_MODIFIER      = _LOAD_MODIFIER;     ///< Cache load modifier for reading keys (and values)
-    static const RadixRankAlgorithm  RANK_ALGORITHM     = _RANK_ALGORITHM;    ///< The radix ranking algorithm to use
-    static const BlockScanAlgorithm  SCAN_ALGORITHM     = _SCAN_ALGORITHM;    ///< The BlockScan algorithm to use
-};
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-
-
-
-
-/**
- * \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep .
- */
-template <
-    typename AgentRadixSortDownsweepPolicy,     ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type
-    bool     IS_DESCENDING,                     ///< Whether or not the sorted-order is high-to-low
-    typename KeyT,                              ///< KeyT type
-    typename ValueT,                            ///< ValueT type
-    typename OffsetT>                           ///< Signed integer type for global offsets
-struct AgentRadixSortDownsweep
-{
-    //---------------------------------------------------------------------
-    // Type definitions and constants
-    //---------------------------------------------------------------------
-
-    // Appropriate unsigned-bits representation of KeyT
-    typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
-
-    static const UnsignedBits           LOWEST_KEY  = Traits<KeyT>::LOWEST_KEY;
-    static const UnsignedBits           MAX_KEY     = Traits<KeyT>::MAX_KEY;
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM  = AgentRadixSortDownsweepPolicy::LOAD_ALGORITHM;
-    static const CacheLoadModifier      LOAD_MODIFIER   = AgentRadixSortDownsweepPolicy::LOAD_MODIFIER;
-    static const RadixRankAlgorithm     RANK_ALGORITHM  = AgentRadixSortDownsweepPolicy::RANK_ALGORITHM;
-    static const BlockScanAlgorithm     SCAN_ALGORITHM  = AgentRadixSortDownsweepPolicy::SCAN_ALGORITHM;
-
-    enum
-    {
-        BLOCK_THREADS           = AgentRadixSortDownsweepPolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD        = AgentRadixSortDownsweepPolicy::ITEMS_PER_THREAD,
-        RADIX_BITS              = AgentRadixSortDownsweepPolicy::RADIX_BITS,
-        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
-
-        RADIX_DIGITS            = 1 << RADIX_BITS,
-        KEYS_ONLY               = Equals<ValueT, NullType>::VALUE,
-    };
-
-    // Input iterator wrapper type (for applying cache modifier)s
-    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, OffsetT>    KeysItr;
-    typedef CacheModifiedInputIterator<LOAD_MODIFIER, ValueT, OffsetT>          ValuesItr;
-
-    // Radix ranking type to use
-    typedef typename If<(RANK_ALGORITHM == RADIX_RANK_BASIC),
-            BlockRadixRank<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, false, SCAN_ALGORITHM>,
-            typename If<(RANK_ALGORITHM == RADIX_RANK_MEMOIZE),
-                BlockRadixRank<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, true, SCAN_ALGORITHM>,
-                BlockRadixRankMatch<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, SCAN_ALGORITHM>
-            >::Type
-        >::Type BlockRadixRankT;
-
-    enum
-    {
-        /// Number of bin-starting offsets tracked per thread
-        BINS_TRACKED_PER_THREAD = BlockRadixRankT::BINS_TRACKED_PER_THREAD
-    };
-
-    // BlockLoad type (keys)
-    typedef BlockLoad<
-        UnsignedBits,
-        BLOCK_THREADS,
-        ITEMS_PER_THREAD,
-        LOAD_ALGORITHM> BlockLoadKeysT;
-
-    // BlockLoad type (values)
-    typedef BlockLoad<
-        ValueT,
-        BLOCK_THREADS,
-        ITEMS_PER_THREAD,
-        LOAD_ALGORITHM> BlockLoadValuesT;
-
-    // Value exchange array type
-    typedef ValueT ValueExchangeT[TILE_ITEMS];
-
-    /**
-     * Shared memory storage layout
-     */
-    union __align__(16) _TempStorage
-    {
-        typename BlockLoadKeysT::TempStorage    load_keys;
-        typename BlockLoadValuesT::TempStorage  load_values;
-        typename BlockRadixRankT::TempStorage   radix_rank;
-
-        struct
-        {
-            UnsignedBits                        exchange_keys[TILE_ITEMS];
-            OffsetT                             relative_bin_offsets[RADIX_DIGITS];
-        };
-
-        Uninitialized<ValueExchangeT>           exchange_values;
-
-        OffsetT                                 exclusive_digit_prefix[RADIX_DIGITS];
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Thread fields
-    //---------------------------------------------------------------------
-
-    // Shared storage for this CTA
-    _TempStorage    &temp_storage;
-
-    // Input and output device pointers
-    KeysItr         d_keys_in;
-    ValuesItr       d_values_in;
-    UnsignedBits    *d_keys_out;
-    ValueT          *d_values_out;
-
-    // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads)
-    OffsetT         bin_offset[BINS_TRACKED_PER_THREAD];
-
-    // The least-significant bit position of the current digit to extract
-    int             current_bit;
-
-    // Number of bits in current digit
-    int             num_bits;
-
-    // Whether to short-cirucit
-    int             short_circuit;
-
-    //---------------------------------------------------------------------
-    // Utility methods
-    //---------------------------------------------------------------------
-
-
-    /**
-     * Scatter ranked keys through shared memory, then to device-accessible memory
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ScatterKeys(
-        UnsignedBits    (&twiddled_keys)[ITEMS_PER_THREAD],
-        OffsetT         (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int             (&ranks)[ITEMS_PER_THREAD],
-        OffsetT         valid_items)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            temp_storage.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM];
-        }
-
-        CTA_SYNC();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            UnsignedBits key            = temp_storage.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)];
-            UnsignedBits digit          = BFE(key, current_bit, num_bits);
-            relative_bin_offsets[ITEM]  = temp_storage.relative_bin_offsets[digit];
-
-            // Un-twiddle
-            key = Traits<KeyT>::TwiddleOut(key);
-
-            if (FULL_TILE || 
-                (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
-            {
-                d_keys_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = key;
-            }
-        }
-    }
-
-
-    /**
-     * Scatter ranked values through shared memory, then to device-accessible memory
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ScatterValues(
-        ValueT      (&values)[ITEMS_PER_THREAD],
-        OffsetT     (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int         (&ranks)[ITEMS_PER_THREAD],
-        OffsetT     valid_items)
-    {
-        CTA_SYNC();
-
-        ValueExchangeT &exchange_values = temp_storage.exchange_values.Alias();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            exchange_values[ranks[ITEM]] = values[ITEM];
-        }
-
-        CTA_SYNC();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)];
-
-            if (FULL_TILE ||
-                (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
-            {
-                d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value;
-            }
-        }
-    }
-
-    /**
-     * Load a tile of keys (specialized for full tile, any ranking algorithm)
-     */
-    template <int _RANK_ALGORITHM>
-    __device__ __forceinline__ void LoadKeys(
-        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
-        OffsetT                     block_offset,
-        OffsetT                     valid_items,
-        UnsignedBits                oob_item,
-        Int2Type<true>              is_full_tile,
-        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
-    {
-        BlockLoadKeysT(temp_storage.load_keys).Load(
-            d_keys_in + block_offset, keys);
-
-        CTA_SYNC();
-    }
-
-
-    /**
-     * Load a tile of keys (specialized for partial tile, any ranking algorithm)
-     */
-    template <int _RANK_ALGORITHM>
-    __device__ __forceinline__ void LoadKeys(
-        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
-        OffsetT                     block_offset,
-        OffsetT                     valid_items,
-        UnsignedBits                oob_item,
-        Int2Type<false>             is_full_tile,
-        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
-    {
-        // Register pressure work-around: moving valid_items through shfl prevents compiler
-        // from reusing guards/addressing from prior guarded loads
-        valid_items = ShuffleIndex(valid_items, 0, CUB_PTX_WARP_THREADS, 0xffffffff);
-
-        BlockLoadKeysT(temp_storage.load_keys).Load(
-            d_keys_in + block_offset, keys, valid_items, oob_item);
-
-        CTA_SYNC();
-    }
-
-
-    /**
-     * Load a tile of keys (specialized for full tile, match ranking algorithm)
-     */
-    __device__ __forceinline__ void LoadKeys(
-        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
-        OffsetT                     block_offset,
-        OffsetT                     valid_items,
-        UnsignedBits                oob_item,
-        Int2Type<true>              is_full_tile,
-        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
-    {
-        LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys);
-    }
-
-
-    /**
-     * Load a tile of keys (specialized for partial tile, match ranking algorithm)
-     */
-    __device__ __forceinline__ void LoadKeys(
-        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
-        OffsetT                     block_offset,
-        OffsetT                     valid_items,
-        UnsignedBits                oob_item,
-        Int2Type<false>             is_full_tile,
-        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
-    {
-        // Register pressure work-around: moving valid_items through shfl prevents compiler
-        // from reusing guards/addressing from prior guarded loads
-        valid_items = ShuffleIndex(valid_items, 0, CUB_PTX_WARP_THREADS, 0xffffffff);
-
-        LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item);
-    }
-
-
-    /**
-     * Load a tile of values (specialized for full tile, any ranking algorithm)
-     */
-    template <int _RANK_ALGORITHM>
-    __device__ __forceinline__ void LoadValues(
-        ValueT                      (&values)[ITEMS_PER_THREAD],
-        OffsetT                     block_offset,
-        OffsetT                     valid_items,
-        Int2Type<true>              is_full_tile,
-        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
-    {
-        BlockLoadValuesT(temp_storage.load_values).Load(
-            d_values_in + block_offset, values);
-
-        CTA_SYNC();
-    }
-
-
-    /**
-     * Load a tile of values (specialized for partial tile, any ranking algorithm)
-     */
-    template <int _RANK_ALGORITHM>
-    __device__ __forceinline__ void LoadValues(
-        ValueT                      (&values)[ITEMS_PER_THREAD],
-        OffsetT                     block_offset,
-        OffsetT                     valid_items,
-        Int2Type<false>             is_full_tile,
-        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
-    {
-        // Register pressure work-around: moving valid_items through shfl prevents compiler
-        // from reusing guards/addressing from prior guarded loads
-        valid_items = ShuffleIndex(valid_items, 0, CUB_PTX_WARP_THREADS, 0xffffffff);
-
-        BlockLoadValuesT(temp_storage.load_values).Load(
-            d_values_in + block_offset, values, valid_items);
-
-        CTA_SYNC();
-    }
-
-
-    /**
-     * Load a tile of items (specialized for full tile, match ranking algorithm)
-     */
-    __device__ __forceinline__ void LoadValues(
-        ValueT                      (&values)[ITEMS_PER_THREAD],
-        OffsetT                     block_offset,
-        OffsetT                     valid_items,
-        Int2Type<true>              is_full_tile,
-        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
-    {
-        LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values);
-    }
-
-
-    /**
-     * Load a tile of items (specialized for partial tile, match ranking algorithm)
-     */
-    __device__ __forceinline__ void LoadValues(
-        ValueT                      (&values)[ITEMS_PER_THREAD],
-        OffsetT                     block_offset,
-        OffsetT                     valid_items,
-        Int2Type<false>             is_full_tile,
-        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
-    {
-        // Register pressure work-around: moving valid_items through shfl prevents compiler
-        // from reusing guards/addressing from prior guarded loads
-        valid_items = ShuffleIndex(valid_items, 0, CUB_PTX_WARP_THREADS, 0xffffffff);
-
-        LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items);
-    }
-
-
-    /**
-     * Truck along associated values
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void GatherScatterValues(
-        OffsetT         (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int             (&ranks)[ITEMS_PER_THREAD],
-        OffsetT         block_offset,
-        OffsetT         valid_items,
-        Int2Type<false> /*is_keys_only*/)
-    {
-        ValueT values[ITEMS_PER_THREAD];
-
-        CTA_SYNC();
-
-        LoadValues(
-            values,
-            block_offset,
-            valid_items,
-            Int2Type<FULL_TILE>(),
-            Int2Type<RANK_ALGORITHM>());
-
-        ScatterValues<FULL_TILE>(
-            values,
-            relative_bin_offsets,
-            ranks,
-            valid_items);
-    }
-
-
-    /**
-     * Truck along associated values (specialized for key-only sorting)
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void GatherScatterValues(
-        OffsetT         (&/*relative_bin_offsets*/)[ITEMS_PER_THREAD],
-        int             (&/*ranks*/)[ITEMS_PER_THREAD],
-        OffsetT         /*block_offset*/,
-        OffsetT         /*valid_items*/,
-        Int2Type<true>  /*is_keys_only*/)
-    {}
-
-
-    /**
-     * Process tile
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ProcessTile(
-        OffsetT block_offset,
-        const OffsetT &valid_items = TILE_ITEMS)
-    {
-        UnsignedBits    keys[ITEMS_PER_THREAD];
-        int             ranks[ITEMS_PER_THREAD];
-        OffsetT         relative_bin_offsets[ITEMS_PER_THREAD];
-
-        // Assign default (min/max) value to all keys
-        UnsignedBits default_key = (IS_DESCENDING) ? LOWEST_KEY : MAX_KEY;
-
-        // Load tile of keys
-        LoadKeys(
-            keys,
-            block_offset,
-            valid_items, 
-            default_key,
-            Int2Type<FULL_TILE>(),
-            Int2Type<RANK_ALGORITHM>());
-
-        // Twiddle key bits if necessary
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            keys[KEY] = Traits<KeyT>::TwiddleIn(keys[KEY]);
-        }
-
-        // Rank the twiddled keys
-        int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD];
-        BlockRadixRankT(temp_storage.radix_rank).RankKeys(
-            keys,
-            ranks,
-            current_bit,
-            num_bits,
-            exclusive_digit_prefix);
-
-        CTA_SYNC();
-
-        // Share exclusive digit prefix
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-            {
-                // Store exclusive prefix
-                temp_storage.exclusive_digit_prefix[bin_idx] =
-                    exclusive_digit_prefix[track];
-            }
-        }
-
-        CTA_SYNC();
-
-        // Get inclusive digit prefix
-        int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD];
-
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-            {
-                if (IS_DESCENDING)
-                {
-                    // Get inclusive digit prefix from exclusive prefix (higher bins come first)
-                    inclusive_digit_prefix[track] = (bin_idx == 0) ?
-                        (BLOCK_THREADS * ITEMS_PER_THREAD) :
-                        temp_storage.exclusive_digit_prefix[bin_idx - 1];
-                }
-                else
-                {
-                    // Get inclusive digit prefix from exclusive prefix (lower bins come first)
-                    inclusive_digit_prefix[track] = (bin_idx == RADIX_DIGITS - 1) ?
-                        (BLOCK_THREADS * ITEMS_PER_THREAD) :
-                        temp_storage.exclusive_digit_prefix[bin_idx + 1];
-                }
-            }
-        }
-
-        CTA_SYNC();
-
-        // Update global scatter base offsets for each digit
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-            {
-                bin_offset[track] -= exclusive_digit_prefix[track];
-                temp_storage.relative_bin_offsets[bin_idx] = bin_offset[track];
-                bin_offset[track] += inclusive_digit_prefix[track];
-            }
-        }
-
-        CTA_SYNC();
-
-        // Scatter keys
-        ScatterKeys<FULL_TILE>(keys, relative_bin_offsets, ranks, valid_items);
-
-        // Gather/scatter values
-        GatherScatterValues<FULL_TILE>(relative_bin_offsets , ranks, block_offset, valid_items, Int2Type<KEYS_ONLY>());
-    }
-
-    //---------------------------------------------------------------------
-    // Copy shortcut
-    //---------------------------------------------------------------------
-
-    /**
-     * Copy tiles within the range of input
-     */
-    template <
-        typename InputIteratorT,
-        typename T>
-    __device__ __forceinline__ void Copy(
-        InputIteratorT  d_in,
-        T               *d_out,
-        OffsetT         block_offset,
-        OffsetT         block_end)
-    {
-        // Simply copy the input
-        while (block_offset + TILE_ITEMS <= block_end)
-        {
-            T items[ITEMS_PER_THREAD];
-
-            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items);
-            CTA_SYNC();
-            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items);
-
-            block_offset += TILE_ITEMS;
-        }
-
-        // Clean up last partial tile with guarded-I/O
-        if (block_offset < block_end)
-        {
-            OffsetT valid_items = block_end - block_offset;
-
-            T items[ITEMS_PER_THREAD];
-
-            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items, valid_items);
-            CTA_SYNC();
-            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items, valid_items);
-        }
-    }
-
-
-    /**
-     * Copy tiles within the range of input (specialized for NullType)
-     */
-    template <typename InputIteratorT>
-    __device__ __forceinline__ void Copy(
-        InputIteratorT  /*d_in*/,
-        NullType        * /*d_out*/,
-        OffsetT         /*block_offset*/,
-        OffsetT         /*block_end*/)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ AgentRadixSortDownsweep(
-        TempStorage     &temp_storage,
-        OffsetT         (&bin_offset)[BINS_TRACKED_PER_THREAD],
-        OffsetT         num_items,
-        const KeyT      *d_keys_in,
-        KeyT            *d_keys_out,
-        const ValueT    *d_values_in,
-        ValueT          *d_values_out,
-        int             current_bit,
-        int             num_bits)
-    :
-        temp_storage(temp_storage.Alias()),
-        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
-        d_values_in(d_values_in),
-        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
-        d_values_out(d_values_out),
-        current_bit(current_bit),
-        num_bits(num_bits),
-        short_circuit(1)
-    {
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            this->bin_offset[track] = bin_offset[track];
-
-            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-            {
-                // Short circuit if the histogram has only bin counts of only zeros or problem-size
-                short_circuit = short_circuit && ((bin_offset[track] == 0) || (bin_offset[track] == num_items));
-            }
-        }
-
-        short_circuit = CTA_SYNC_AND(short_circuit);
-    }
-
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ AgentRadixSortDownsweep(
-        TempStorage     &temp_storage,
-        OffsetT         num_items,
-        OffsetT         *d_spine,
-        const KeyT      *d_keys_in,
-        KeyT            *d_keys_out,
-        const ValueT    *d_values_in,
-        ValueT          *d_values_out,
-        int             current_bit,
-        int             num_bits)
-    :
-        temp_storage(temp_storage.Alias()),
-        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
-        d_values_in(d_values_in),
-        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
-        d_values_out(d_values_out),
-        current_bit(current_bit),
-        num_bits(num_bits),
-        short_circuit(1)
-    {
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
-
-            // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit)
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-            {
-                if (IS_DESCENDING)
-                    bin_idx = RADIX_DIGITS - bin_idx - 1;
-
-                // Short circuit if the first block's histogram has only bin counts of only zeros or problem-size
-                OffsetT first_block_bin_offset = d_spine[gridDim.x * bin_idx];
-                short_circuit = short_circuit && ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items));
-
-                // Load my block's bin offset for my bin
-                bin_offset[track] = d_spine[(gridDim.x * bin_idx) + blockIdx.x];
-            }
-        }
-
-        short_circuit = CTA_SYNC_AND(short_circuit);
-    }
-
-
-    /**
-     * Distribute keys from a segment of input tiles.
-     */
-    __device__ __forceinline__ void ProcessRegion(
-        OffsetT   block_offset,
-        OffsetT   block_end)
-    {
-        if (short_circuit)
-        {
-            // Copy keys
-            Copy(d_keys_in, d_keys_out, block_offset, block_end);
-
-            // Copy values
-            Copy(d_values_in, d_values_out, block_offset, block_end);
-        }
-        else
-        {
-            // Process full tiles of tile_items
-            #pragma unroll 1
-            while (block_offset + TILE_ITEMS <= block_end)
-            {
-                ProcessTile<true>(block_offset);
-                block_offset += TILE_ITEMS;
-
-                CTA_SYNC();
-            }
-
-            // Clean up last partial tile with guarded-I/O
-            if (block_offset < block_end)
-            {
-                ProcessTile<false>(block_offset, block_end - block_offset);
-            }
-
-        }
-    }
-
-};
-
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh b/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh
deleted file mode 100644
index efa69858d..000000000
--- a/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh
+++ /dev/null
@@ -1,526 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep .
- */
-
-#pragma once
-
-#include "../thread/thread_reduce.cuh"
-#include "../thread/thread_load.cuh"
-#include "../warp/warp_reduce.cuh"
-#include "../block/block_load.cuh"
-#include "../util_type.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for AgentRadixSortUpsweep
- */
-template <
-    int                 _BLOCK_THREADS,     ///< Threads per thread block
-    int                 _ITEMS_PER_THREAD,  ///< Items per thread (per tile of input)
-    CacheLoadModifier   _LOAD_MODIFIER,     ///< Cache load modifier for reading keys
-    int                 _RADIX_BITS>        ///< The number of radix bits, i.e., log2(bins)
-struct AgentRadixSortUpsweepPolicy
-{
-    enum
-    {
-        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Threads per thread block
-        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,    ///< Items per thread (per tile of input)
-        RADIX_BITS          = _RADIX_BITS,          ///< The number of radix bits, i.e., log2(bins)
-    };
-
-    static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;      ///< Cache load modifier for reading keys
-};
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep .
- */
-template <
-    typename AgentRadixSortUpsweepPolicy,   ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type
-    typename KeyT,                          ///< KeyT type
-    typename OffsetT>                       ///< Signed integer type for global offsets
-struct AgentRadixSortUpsweep
-{
-
-    //---------------------------------------------------------------------
-    // Type definitions and constants
-    //---------------------------------------------------------------------
-
-    typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
-
-    // Integer type for digit counters (to be packed into words of PackedCounters)
-    typedef unsigned char DigitCounter;
-
-    // Integer type for packing DigitCounters into columns of shared memory banks
-    typedef unsigned int PackedCounter;
-
-    static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortUpsweepPolicy::LOAD_MODIFIER;
-
-    enum
-    {
-        RADIX_BITS              = AgentRadixSortUpsweepPolicy::RADIX_BITS,
-        BLOCK_THREADS           = AgentRadixSortUpsweepPolicy::BLOCK_THREADS,
-        KEYS_PER_THREAD         = AgentRadixSortUpsweepPolicy::ITEMS_PER_THREAD,
-
-        RADIX_DIGITS            = 1 << RADIX_BITS,
-
-        LOG_WARP_THREADS        = CUB_PTX_LOG_WARP_THREADS,
-        WARP_THREADS            = 1 << LOG_WARP_THREADS,
-        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-
-        TILE_ITEMS              = BLOCK_THREADS * KEYS_PER_THREAD,
-
-        BYTES_PER_COUNTER       = sizeof(DigitCounter),
-        LOG_BYTES_PER_COUNTER   = Log2<BYTES_PER_COUNTER>::VALUE,
-
-        PACKING_RATIO           = sizeof(PackedCounter) / sizeof(DigitCounter),
-        LOG_PACKING_RATIO       = Log2<PACKING_RATIO>::VALUE,
-
-        LOG_COUNTER_LANES       = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO),
-        COUNTER_LANES           = 1 << LOG_COUNTER_LANES,
-
-        // To prevent counter overflow, we must periodically unpack and aggregate the
-        // digit counters back into registers.  Each counter lane is assigned to a
-        // warp for aggregation.
-
-        LANES_PER_WARP          = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS),
-
-        // Unroll tiles in batches without risk of counter overflow
-        UNROLL_COUNT            = CUB_MIN(64, 255 / KEYS_PER_THREAD),
-        UNROLLED_ELEMENTS       = UNROLL_COUNT * TILE_ITEMS,
-    };
-
-
-    // Input iterator wrapper type (for applying cache modifier)s
-    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, OffsetT> KeysItr;
-
-    /**
-     * Shared memory storage layout
-     */
-    union __align__(16) _TempStorage
-    {
-        DigitCounter    thread_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];
-        PackedCounter   packed_thread_counters[COUNTER_LANES][BLOCK_THREADS];
-        OffsetT         block_counters[WARP_THREADS][RADIX_DIGITS];
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Thread fields (aggregate state bundle)
-    //---------------------------------------------------------------------
-
-    // Shared storage for this CTA
-    _TempStorage    &temp_storage;
-
-    // Thread-local counters for periodically aggregating composite-counter lanes
-    OffsetT         local_counts[LANES_PER_WARP][PACKING_RATIO];
-
-    // Input and output device pointers
-    KeysItr         d_keys_in;
-
-    // The least-significant bit position of the current digit to extract
-    int             current_bit;
-
-    // Number of bits in current digit
-    int             num_bits;
-
-
-
-    //---------------------------------------------------------------------
-    // Helper structure for templated iteration
-    //---------------------------------------------------------------------
-
-    // Iterate
-    template <int COUNT, int MAX>
-    struct Iterate
-    {
-        // BucketKeys
-        static __device__ __forceinline__ void BucketKeys(
-            AgentRadixSortUpsweep       &cta,
-            UnsignedBits                keys[KEYS_PER_THREAD])
-        {
-            cta.Bucket(keys[COUNT]);
-
-            // Next
-            Iterate<COUNT + 1, MAX>::BucketKeys(cta, keys);
-        }
-    };
-
-    // Terminate
-    template <int MAX>
-    struct Iterate<MAX, MAX>
-    {
-        // BucketKeys
-        static __device__ __forceinline__ void BucketKeys(AgentRadixSortUpsweep &/*cta*/, UnsignedBits /*keys*/[KEYS_PER_THREAD]) {}
-    };
-
-
-    //---------------------------------------------------------------------
-    // Utility methods
-    //---------------------------------------------------------------------
-
-    /**
-     * Decode a key and increment corresponding smem digit counter
-     */
-    __device__ __forceinline__ void Bucket(UnsignedBits key)
-    {
-        // Perform transform op
-        UnsignedBits converted_key = Traits<KeyT>::TwiddleIn(key);
-
-        // Extract current digit bits
-        UnsignedBits digit = BFE(converted_key, current_bit, num_bits);
-
-        // Get sub-counter offset
-        UnsignedBits sub_counter = digit & (PACKING_RATIO - 1);
-
-        // Get row offset
-        UnsignedBits row_offset = digit >> LOG_PACKING_RATIO;
-
-        // Increment counter
-        temp_storage.thread_counters[row_offset][threadIdx.x][sub_counter]++;
-    }
-
-
-    /**
-     * Reset composite counters
-     */
-    __device__ __forceinline__ void ResetDigitCounters()
-    {
-        #pragma unroll
-        for (int LANE = 0; LANE < COUNTER_LANES; LANE++)
-        {
-            temp_storage.packed_thread_counters[LANE][threadIdx.x] = 0;
-        }
-    }
-
-
-    /**
-     * Reset the unpacked counters in each thread
-     */
-    __device__ __forceinline__ void ResetUnpackedCounters()
-    {
-        #pragma unroll
-        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
-        {
-            #pragma unroll
-            for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
-            {
-                local_counts[LANE][UNPACKED_COUNTER] = 0;
-            }
-        }
-    }
-
-
-    /**
-     * Extracts and aggregates the digit counters for each counter lane
-     * owned by this warp
-     */
-    __device__ __forceinline__ void UnpackDigitCounts()
-    {
-        unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
-        unsigned int warp_tid = LaneId();
-
-        #pragma unroll
-        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
-        {
-            const int counter_lane = (LANE * WARPS) + warp_id;
-            if (counter_lane < COUNTER_LANES)
-            {
-                #pragma unroll
-                for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS)
-                {
-                    #pragma unroll
-                    for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
-                    {
-                        OffsetT counter = temp_storage.thread_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER];
-                        local_counts[LANE][UNPACKED_COUNTER] += counter;
-                    }
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Processes a single, full tile
-     */
-    __device__ __forceinline__ void ProcessFullTile(OffsetT block_offset)
-    {
-        // Tile of keys
-        UnsignedBits keys[KEYS_PER_THREAD];
-
-        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys_in + block_offset, keys);
-
-        // Prevent hoisting
-        CTA_SYNC();
-
-        // Bucket tile of keys
-        Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys);
-    }
-
-
-    /**
-     * Processes a single load (may have some threads masked off)
-     */
-    __device__ __forceinline__ void ProcessPartialTile(
-        OffsetT block_offset,
-        const OffsetT &block_end)
-    {
-        // Process partial tile if necessary using single loads
-        block_offset += threadIdx.x;
-        while (block_offset < block_end)
-        {
-            // Load and bucket key
-            UnsignedBits key = d_keys_in[block_offset];
-            Bucket(key);
-            block_offset += BLOCK_THREADS;
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ AgentRadixSortUpsweep(
-        TempStorage &temp_storage,
-        const KeyT  *d_keys_in,
-        int         current_bit,
-        int         num_bits)
-    :
-        temp_storage(temp_storage.Alias()),
-        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
-        current_bit(current_bit),
-        num_bits(num_bits)
-    {}
-
-
-    /**
-     * Compute radix digit histograms from a segment of input tiles.
-     */
-    __device__ __forceinline__ void ProcessRegion(
-        OffsetT          block_offset,
-        const OffsetT    &block_end)
-    {
-        // Reset digit counters in smem and unpacked counters in registers
-        ResetDigitCounters();
-        ResetUnpackedCounters();
-
-        // Unroll batches of full tiles
-        while (block_offset + UNROLLED_ELEMENTS <= block_end)
-        {
-            for (int i = 0; i < UNROLL_COUNT; ++i)
-            {
-                ProcessFullTile(block_offset);
-                block_offset += TILE_ITEMS;
-            }
-
-            CTA_SYNC();
-
-            // Aggregate back into local_count registers to prevent overflow
-            UnpackDigitCounts();
-
-            CTA_SYNC();
-
-            // Reset composite counters in lanes
-            ResetDigitCounters();
-        }
-
-        // Unroll single full tiles
-        while (block_offset + TILE_ITEMS <= block_end)
-        {
-            ProcessFullTile(block_offset);
-            block_offset += TILE_ITEMS;
-        }
-
-        // Process partial tile if necessary
-        ProcessPartialTile(
-            block_offset,
-            block_end);
-
-        CTA_SYNC();
-
-        // Aggregate back into local_count registers
-        UnpackDigitCounts();
-    }
-
-
-    /**
-     * Extract counts (saving them to the external array)
-     */
-    template <bool IS_DESCENDING>
-    __device__ __forceinline__ void ExtractCounts(
-        OffsetT     *counters,
-        int         bin_stride = 1,
-        int         bin_offset = 0)
-    {
-        unsigned int warp_id    = threadIdx.x >> LOG_WARP_THREADS;
-        unsigned int warp_tid   = LaneId();
-
-        // Place unpacked digit counters in shared memory
-        #pragma unroll
-        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
-        {
-            int counter_lane = (LANE * WARPS) + warp_id;
-            if (counter_lane < COUNTER_LANES)
-            {
-                int digit_row = counter_lane << LOG_PACKING_RATIO;
-
-                #pragma unroll
-                for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
-                {
-                    int bin_idx = digit_row + UNPACKED_COUNTER;
-
-                    temp_storage.block_counters[warp_tid][bin_idx] =
-                        local_counts[LANE][UNPACKED_COUNTER];
-                }
-            }
-        }
-
-        CTA_SYNC();
-
-        // Rake-reduce bin_count reductions
-
-        // Whole blocks
-        #pragma unroll
-        for (int BIN_BASE   = RADIX_DIGITS % BLOCK_THREADS;
-            (BIN_BASE + BLOCK_THREADS) <= RADIX_DIGITS;
-            BIN_BASE += BLOCK_THREADS)
-        {
-            int bin_idx = BIN_BASE + threadIdx.x;
-
-            OffsetT bin_count = 0;
-            #pragma unroll
-            for (int i = 0; i < WARP_THREADS; ++i)
-                bin_count += temp_storage.block_counters[i][bin_idx];
-
-            if (IS_DESCENDING)
-                bin_idx = RADIX_DIGITS - bin_idx - 1;
-
-            counters[(bin_stride * bin_idx) + bin_offset] = bin_count;
-        }
-
-        // Remainder
-        if ((RADIX_DIGITS % BLOCK_THREADS != 0) && (threadIdx.x < RADIX_DIGITS))
-        {
-            int bin_idx = threadIdx.x;
-
-            OffsetT bin_count = 0;
-            #pragma unroll
-            for (int i = 0; i < WARP_THREADS; ++i)
-                bin_count += temp_storage.block_counters[i][bin_idx];
-
-            if (IS_DESCENDING)
-                bin_idx = RADIX_DIGITS - bin_idx - 1;
-
-            counters[(bin_stride * bin_idx) + bin_offset] = bin_count;
-        }
-    }
-
-
-    /**
-     * Extract counts
-     */
-    template <int BINS_TRACKED_PER_THREAD>
-    __device__ __forceinline__ void ExtractCounts(
-        OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD])  ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
-    {
-        unsigned int warp_id    = threadIdx.x >> LOG_WARP_THREADS;
-        unsigned int warp_tid   = LaneId();
-
-        // Place unpacked digit counters in shared memory
-        #pragma unroll
-        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
-        {
-            int counter_lane = (LANE * WARPS) + warp_id;
-            if (counter_lane < COUNTER_LANES)
-            {
-                int digit_row = counter_lane << LOG_PACKING_RATIO;
-
-                #pragma unroll
-                for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
-                {
-                    int bin_idx = digit_row + UNPACKED_COUNTER;
-
-                    temp_storage.block_counters[warp_tid][bin_idx] =
-                        local_counts[LANE][UNPACKED_COUNTER];
-                }
-            }
-        }
-
-        CTA_SYNC();
-
-        // Rake-reduce bin_count reductions
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
-
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-            {
-                bin_count[track] = 0;
-
-                #pragma unroll
-                for (int i = 0; i < WARP_THREADS; ++i)
-                    bin_count[track] += temp_storage.block_counters[i][bin_idx];
-            }
-        }
-    }
-
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh b/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh
deleted file mode 100644
index df3f4a70f..000000000
--- a/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh
+++ /dev/null
@@ -1,385 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction .
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "../block/block_load.cuh"
-#include "../block/block_reduce.cuh"
-#include "../grid/grid_mapping.cuh"
-#include "../grid/grid_even_share.cuh"
-#include "../util_type.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for AgentReduce
- */
-template <
-    int                     _BLOCK_THREADS,         ///< Threads per thread block
-    int                     _ITEMS_PER_THREAD,      ///< Items per thread (per tile of input)
-    int                     _VECTOR_LOAD_LENGTH,    ///< Number of items per vectorized load
-    BlockReduceAlgorithm    _BLOCK_ALGORITHM,       ///< Cooperative block-wide reduction algorithm to use
-    CacheLoadModifier       _LOAD_MODIFIER>         ///< Cache load modifier for reading input elements
-struct AgentReducePolicy
-{
-    enum
-    {
-        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Threads per thread block
-        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,    ///< Items per thread (per tile of input)
-        VECTOR_LOAD_LENGTH  = _VECTOR_LOAD_LENGTH,  ///< Number of items per vectorized load
-    };
-
-    static const BlockReduceAlgorithm  BLOCK_ALGORITHM      = _BLOCK_ALGORITHM;     ///< Cooperative block-wide reduction algorithm to use
-    static const CacheLoadModifier     LOAD_MODIFIER        = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
-};
-
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction .
- *
- * Each thread reduces only the values it loads. If \p FIRST_TILE, this
- * partial reduction is stored into \p thread_aggregate.  Otherwise it is
- * accumulated into \p thread_aggregate.
- */
-template <
-    typename AgentReducePolicy,        ///< Parameterized AgentReducePolicy tuning policy type
-    typename InputIteratorT,           ///< Random-access iterator type for input
-    typename OutputIteratorT,          ///< Random-access iterator type for output
-    typename OffsetT,                  ///< Signed integer type for global offsets
-    typename ReductionOp>              ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-struct AgentReduce
-{
-
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-    /// The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    /// Vector type of InputT for data movement
-    typedef typename CubVector<InputT, AgentReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT;
-
-    /// Input iterator wrapper type (for applying cache modifier)
-    typedef typename If<IsPointer<InputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, InputT, OffsetT>,      // Wrap the native input pointer with CacheModifiedInputIterator
-            InputIteratorT>::Type                                                               // Directly use the supplied input iterator type
-        WrappedInputIteratorT;
-
-    /// Constants
-    enum
-    {
-        BLOCK_THREADS       = AgentReducePolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = AgentReducePolicy::ITEMS_PER_THREAD,
-        VECTOR_LOAD_LENGTH  = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH),
-        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
-
-        // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type
-        ATTEMPT_VECTORIZATION   = (VECTOR_LOAD_LENGTH > 1) &&
-                                    (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) &&
-                                    (IsPointer<InputIteratorT>::VALUE) && Traits<InputT>::PRIMITIVE,
-
-    };
-
-    static const CacheLoadModifier    LOAD_MODIFIER   = AgentReducePolicy::LOAD_MODIFIER;
-    static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM;
-
-    /// Parameterized BlockReduce primitive
-    typedef BlockReduce<OutputT, BLOCK_THREADS, AgentReducePolicy::BLOCK_ALGORITHM> BlockReduceT;
-
-    /// Shared memory type required by this thread block
-    struct _TempStorage
-    {
-        typename BlockReduceT::TempStorage  reduce;
-    };
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage&           temp_storage;       ///< Reference to temp_storage
-    InputIteratorT          d_in;               ///< Input data to reduce
-    WrappedInputIteratorT   d_wrapped_in;       ///< Wrapped input data to reduce
-    ReductionOp             reduction_op;       ///< Binary reduction operator
-
-
-    //---------------------------------------------------------------------
-    // Utility
-    //---------------------------------------------------------------------
-
-
-    // Whether or not the input is aligned with the vector type (specialized for types we can vectorize)
-    template <typename Iterator>
-    static __device__ __forceinline__ bool IsAligned(
-        Iterator        d_in,
-        Int2Type<true>  /*can_vectorize*/)
-    {
-        return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
-    }
-
-    // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize)
-    template <typename Iterator>
-    static __device__ __forceinline__ bool IsAligned(
-        Iterator        /*d_in*/,
-        Int2Type<false> /*can_vectorize*/)
-    {
-        return false;
-    }
-
-
-    //---------------------------------------------------------------------
-    // Constructor
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ AgentReduce(
-        TempStorage&            temp_storage,       ///< Reference to temp_storage
-        InputIteratorT          d_in,               ///< Input data to reduce
-        ReductionOp             reduction_op)       ///< Binary reduction operator
-    :
-        temp_storage(temp_storage.Alias()),
-        d_in(d_in),
-        d_wrapped_in(d_in),
-        reduction_op(reduction_op)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Tile consumption
-    //---------------------------------------------------------------------
-
-    /**
-     * Consume a full tile of input (non-vectorized)
-     */
-    template <int IS_FIRST_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        OutputT                 &thread_aggregate,
-        OffsetT                 block_offset,       ///< The offset the tile to consume
-        int                     /*valid_items*/,    ///< The number of valid items in the tile
-        Int2Type<true>          /*is_full_tile*/,   ///< Whether or not this is a full tile
-        Int2Type<false>         /*can_vectorize*/)  ///< Whether or not we can vectorize loads
-    {
-        OutputT items[ITEMS_PER_THREAD];
-
-        // Load items in striped fashion
-        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);
-
-        // Reduce items within each thread stripe
-        thread_aggregate = (IS_FIRST_TILE) ?
-            internal::ThreadReduce(items, reduction_op) :
-            internal::ThreadReduce(items, reduction_op, thread_aggregate);
-    }
-
-
-    /**
-     * Consume a full tile of input (vectorized)
-     */
-    template <int IS_FIRST_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        OutputT                 &thread_aggregate,
-        OffsetT                 block_offset,       ///< The offset the tile to consume
-        int                     /*valid_items*/,    ///< The number of valid items in the tile
-        Int2Type<true>          /*is_full_tile*/,   ///< Whether or not this is a full tile
-        Int2Type<true>          /*can_vectorize*/)  ///< Whether or not we can vectorize loads
-    {
-        // Alias items as an array of VectorT and load it in striped fashion
-        enum { WORDS =  ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };
-
-        // Fabricate a vectorized input iterator
-        InputT *d_in_unqualified = const_cast<InputT*>(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
-        CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, VectorT, OffsetT> d_vec_in(
-            reinterpret_cast<VectorT*>(d_in_unqualified));
-
-        // Load items as vector items
-        InputT input_items[ITEMS_PER_THREAD];
-        VectorT *vec_items = reinterpret_cast<VectorT*>(input_items);
-        #pragma unroll
-        for (int i = 0; i < WORDS; ++i)
-            vec_items[i] = d_vec_in[BLOCK_THREADS * i];
-
-        // Convert from input type to output type
-        OutputT items[ITEMS_PER_THREAD];
-        #pragma unroll
-        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
-            items[i] = input_items[i];
-
-        // Reduce items within each thread stripe
-        thread_aggregate = (IS_FIRST_TILE) ?
-            internal::ThreadReduce(items, reduction_op) :
-            internal::ThreadReduce(items, reduction_op, thread_aggregate);
-    }
-
-
-    /**
-     * Consume a partial tile of input
-     */
-    template <int IS_FIRST_TILE, int CAN_VECTORIZE>
-    __device__ __forceinline__ void ConsumeTile(
-        OutputT                 &thread_aggregate,
-        OffsetT                 block_offset,       ///< The offset the tile to consume
-        int                     valid_items,        ///< The number of valid items in the tile
-        Int2Type<false>         /*is_full_tile*/,   ///< Whether or not this is a full tile
-        Int2Type<CAN_VECTORIZE> /*can_vectorize*/)  ///< Whether or not we can vectorize loads
-    {
-        // Partial tile
-        int thread_offset = threadIdx.x;
-
-        // Read first item
-        if ((IS_FIRST_TILE) && (thread_offset < valid_items))
-        {
-            thread_aggregate = d_wrapped_in[block_offset + thread_offset];
-            thread_offset += BLOCK_THREADS;
-        }
-
-        // Continue reading items (block-striped)
-        while (thread_offset < valid_items)
-        {
-            OutputT item        (d_wrapped_in[block_offset + thread_offset]);
-            thread_aggregate    = reduction_op(thread_aggregate, item);
-            thread_offset       += BLOCK_THREADS;
-        }
-    }
-
-
-    //---------------------------------------------------------------
-    // Consume a contiguous segment of tiles
-    //---------------------------------------------------------------------
-
-    /**
-     * \brief Reduce a contiguous segment of input tiles
-     */
-    template <int CAN_VECTORIZE>
-    __device__ __forceinline__ OutputT ConsumeRange(
-        GridEvenShare<OffsetT> &even_share,          ///< GridEvenShare descriptor
-        Int2Type<CAN_VECTORIZE> can_vectorize)      ///< Whether or not we can vectorize loads
-    {
-        OutputT thread_aggregate;
-
-        if (even_share.block_offset + TILE_ITEMS > even_share.block_end)
-        {
-            // First tile isn't full (not all threads have valid items)
-            int valid_items = even_share.block_end - even_share.block_offset;
-            ConsumeTile<true>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
-            return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
-        }
-
-        // At least one full block
-        ConsumeTile<true>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
-        even_share.block_offset += even_share.block_stride;
-
-        // Consume subsequent full tiles of input
-        while (even_share.block_offset + TILE_ITEMS <= even_share.block_end)
-        {
-            ConsumeTile<false>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
-            even_share.block_offset += even_share.block_stride;
-        }
-
-        // Consume a partially-full tile
-        if (even_share.block_offset < even_share.block_end)
-        {
-            int valid_items = even_share.block_end - even_share.block_offset;
-            ConsumeTile<false>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
-        }
-
-        // Compute block-wide reduction (all threads have valid items)
-        return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);
-    }
-
-
-    /**
-     * \brief Reduce a contiguous segment of input tiles
-     */
-    __device__ __forceinline__ OutputT ConsumeRange(
-        OffsetT block_offset,                       ///< [in] Threadblock begin offset (inclusive)
-        OffsetT block_end)                          ///< [in] Threadblock end offset (exclusive)
-    {
-        GridEvenShare<OffsetT> even_share;
-        even_share.template BlockInit<TILE_ITEMS>(block_offset, block_end);
-
-        return (IsAligned(d_in + block_offset, Int2Type<ATTEMPT_VECTORIZATION>())) ?
-            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
-            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
-    }
-
-
-    /**
-     * Reduce a contiguous segment of input tiles
-     */
-    __device__ __forceinline__ OutputT ConsumeTiles(
-        GridEvenShare<OffsetT> &even_share)        ///< [in] GridEvenShare descriptor
-    {
-        // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block
-        even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();
-
-        return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
-            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
-            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
-
-    }
-
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh b/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh
deleted file mode 100644
index d68201013..000000000
--- a/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh
+++ /dev/null
@@ -1,547 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "single_pass_scan_operators.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_scan.cuh"
-#include "../block/block_discontinuity.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../iterator/constant_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for AgentReduceByKey
- */
-template <
-    int                         _BLOCK_THREADS,                 ///< Threads per thread block
-    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
-    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
-    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
-struct AgentReduceByKeyPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
-    };
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
-    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
-};
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key
- */
-template <
-    typename    AgentReduceByKeyPolicyT,        ///< Parameterized AgentReduceByKeyPolicy tuning policy type
-    typename    KeysInputIteratorT,             ///< Random-access input iterator type for keys
-    typename    UniqueOutputIteratorT,          ///< Random-access output iterator type for keys
-    typename    ValuesInputIteratorT,           ///< Random-access input iterator type for values
-    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
-    typename    NumRunsOutputIteratorT,         ///< Output iterator type for recording number of items selected
-    typename    EqualityOpT,                    ///< KeyT equality operator type
-    typename    ReductionOpT,                   ///< ValueT reduction operator type
-    typename    OffsetT>                        ///< Signed integer type for global offsets
-struct AgentReduceByKey
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // The input keys type
-    typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;
-
-    // The output keys type
-    typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE),    // KeyOutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<KeysInputIteratorT>::value_type,                                              // ... then the input iterator's value type,
-        typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type KeyOutputT;                         // ... else the output iterator's value type
-
-    // The input values type
-    typedef typename std::iterator_traits<ValuesInputIteratorT>::value_type ValueInputT;
-
-    // The output values type
-    typedef typename If<(Equals<typename std::iterator_traits<AggregatesOutputIteratorT>::value_type, void>::VALUE),    // ValueOutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<ValuesInputIteratorT>::value_type,                                                // ... then the input iterator's value type,
-        typename std::iterator_traits<AggregatesOutputIteratorT>::value_type>::Type ValueOutputT;                       // ... else the output iterator's value type
-
-    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
-    typedef KeyValuePair<OffsetT, ValueOutputT> OffsetValuePairT;
-
-    // Tuple type for pairing keys and values
-    typedef KeyValuePair<KeyOutputT, ValueOutputT> KeyValuePairT;
-
-    // Tile status descriptor interface type
-    typedef ReduceByKeyScanTileState<ValueOutputT, OffsetT> ScanTileStateT;
-
-    // Guarded inequality functor
-    template <typename _EqualityOpT>
-    struct GuardedInequalityWrapper
-    {
-        _EqualityOpT     op;             ///< Wrapped equality operator
-        int             num_remaining;  ///< Items remaining
-
-        /// Constructor
-        __host__ __device__ __forceinline__
-        GuardedInequalityWrapper(_EqualityOpT op, int num_remaining) : op(op), num_remaining(num_remaining) {}
-
-        /// Boolean inequality operator, returns <tt>(a != b)</tt>
-        template <typename T>
-        __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b, int idx) const
-        {
-            if (idx < num_remaining)
-                return !op(a, b);   // In bounds
-
-            // Return true if first out-of-bounds item, false otherwise
-            return (idx == num_remaining);
-       }
-    };
-
-
-    // Constants
-    enum
-    {
-        BLOCK_THREADS       = AgentReduceByKeyPolicyT::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD,
-        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
-        TWO_PHASE_SCATTER   = (ITEMS_PER_THREAD > 1),
-
-        // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type)
-        HAS_IDENTITY_ZERO   = (Equals<ReductionOpT, cub::Sum>::VALUE) && (Traits<ValueOutputT>::PRIMITIVE),
-    };
-
-    // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys
-    typedef typename If<IsPointer<KeysInputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, KeyInputT, OffsetT>,     // Wrap the native input pointer with CacheModifiedValuesInputIterator
-            KeysInputIteratorT>::Type                                                                   // Directly use the supplied input iterator type
-        WrappedKeysInputIteratorT;
-
-    // Cache-modified Input iterator wrapper type (for applying cache modifier) for values
-    typedef typename If<IsPointer<ValuesInputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT>,   // Wrap the native input pointer with CacheModifiedValuesInputIterator
-            ValuesInputIteratorT>::Type                                                                 // Directly use the supplied input iterator type
-        WrappedValuesInputIteratorT;
-
-    // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values
-    typedef typename If<IsPointer<AggregatesOutputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT>,   // Wrap the native input pointer with CacheModifiedValuesInputIterator
-            AggregatesOutputIteratorT>::Type                                                            // Directly use the supplied input iterator type
-        WrappedFixupInputIteratorT;
-
-    // Reduce-value-by-segment scan operator
-    typedef ReduceBySegmentOp<ReductionOpT> ReduceBySegmentOpT;
-
-    // Parameterized BlockLoad type for keys
-    typedef BlockLoad<
-            KeyOutputT,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD,
-            AgentReduceByKeyPolicyT::LOAD_ALGORITHM>
-        BlockLoadKeysT;
-
-    // Parameterized BlockLoad type for values
-    typedef BlockLoad<
-            ValueOutputT,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD,
-            AgentReduceByKeyPolicyT::LOAD_ALGORITHM>
-        BlockLoadValuesT;
-
-    // Parameterized BlockDiscontinuity type for keys
-    typedef BlockDiscontinuity<
-            KeyOutputT,
-            BLOCK_THREADS>
-        BlockDiscontinuityKeys;
-
-    // Parameterized BlockScan type
-    typedef BlockScan<
-            OffsetValuePairT,
-            BLOCK_THREADS,
-            AgentReduceByKeyPolicyT::SCAN_ALGORITHM>
-        BlockScanT;
-
-    // Callback type for obtaining tile prefix during block scan
-    typedef TilePrefixCallbackOp<
-            OffsetValuePairT,
-            ReduceBySegmentOpT,
-            ScanTileStateT>
-        TilePrefixCallbackOpT;
-
-    // Key and value exchange types
-    typedef KeyOutputT    KeyExchangeT[TILE_ITEMS + 1];
-    typedef ValueOutputT  ValueExchangeT[TILE_ITEMS + 1];
-
-    // Shared memory type for this thread block
-    union _TempStorage
-    {
-        struct
-        {
-            typename BlockScanT::TempStorage                scan;           // Smem needed for tile scanning
-            typename TilePrefixCallbackOpT::TempStorage     prefix;         // Smem needed for cooperative prefix callback
-            typename BlockDiscontinuityKeys::TempStorage    discontinuity;  // Smem needed for discontinuity detection
-        };
-
-        // Smem needed for loading keys
-        typename BlockLoadKeysT::TempStorage load_keys;
-
-        // Smem needed for loading values
-        typename BlockLoadValuesT::TempStorage load_values;
-
-        // Smem needed for compacting key value pairs(allows non POD items in this union)
-        Uninitialized<KeyValuePairT[TILE_ITEMS + 1]> raw_exchange;
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
-    WrappedKeysInputIteratorT       d_keys_in;          ///< Input keys
-    UniqueOutputIteratorT           d_unique_out;       ///< Unique output keys
-    WrappedValuesInputIteratorT     d_values_in;        ///< Input values
-    AggregatesOutputIteratorT       d_aggregates_out;   ///< Output value aggregates
-    NumRunsOutputIteratorT          d_num_runs_out;     ///< Output pointer for total number of segments identified
-    EqualityOpT                     equality_op;        ///< KeyT equality operator
-    ReductionOpT                    reduction_op;       ///< Reduction operator
-    ReduceBySegmentOpT              scan_op;            ///< Reduce-by-segment scan operator
-
-
-    //---------------------------------------------------------------------
-    // Constructor
-    //---------------------------------------------------------------------
-
-    // Constructor
-    __device__ __forceinline__
-    AgentReduceByKey(
-        TempStorage&                temp_storage,       ///< Reference to temp_storage
-        KeysInputIteratorT          d_keys_in,          ///< Input keys
-        UniqueOutputIteratorT       d_unique_out,       ///< Unique output keys
-        ValuesInputIteratorT        d_values_in,        ///< Input values
-        AggregatesOutputIteratorT   d_aggregates_out,   ///< Output value aggregates
-        NumRunsOutputIteratorT      d_num_runs_out,     ///< Output pointer for total number of segments identified
-        EqualityOpT                 equality_op,        ///< KeyT equality operator
-        ReductionOpT                reduction_op)       ///< ValueT reduction operator
-    :
-        temp_storage(temp_storage.Alias()),
-        d_keys_in(d_keys_in),
-        d_unique_out(d_unique_out),
-        d_values_in(d_values_in),
-        d_aggregates_out(d_aggregates_out),
-        d_num_runs_out(d_num_runs_out),
-        equality_op(equality_op),
-        reduction_op(reduction_op),
-        scan_op(reduction_op)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Scatter utility methods
-    //---------------------------------------------------------------------
-
-    /**
-     * Directly scatter flagged items to output offsets
-     */
-    __device__ __forceinline__ void ScatterDirect(
-        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
-        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
-        OffsetT         (&segment_indices)[ITEMS_PER_THREAD])
-    {
-        // Scatter flagged keys and values
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            if (segment_flags[ITEM])
-            {
-                d_unique_out[segment_indices[ITEM]]     = scatter_items[ITEM].key;
-                d_aggregates_out[segment_indices[ITEM]] = scatter_items[ITEM].value;
-            }
-        }
-    }
-
-
-    /**
-     * 2-phase scatter flagged items to output offsets
-     *
-     * The exclusive scan causes each head flag to be paired with the previous
-     * value aggregate: the scatter offsets must be decremented for value aggregates
-     */
-    __device__ __forceinline__ void ScatterTwoPhase(
-        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
-        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
-        OffsetT         (&segment_indices)[ITEMS_PER_THREAD],
-        OffsetT         num_tile_segments,
-        OffsetT         num_tile_segments_prefix)
-    {
-        CTA_SYNC();
-
-        // Compact and scatter pairs
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            if (segment_flags[ITEM])
-            {
-                temp_storage.raw_exchange.Alias()[segment_indices[ITEM] - num_tile_segments_prefix] = scatter_items[ITEM];
-            }
-        }
-
-        CTA_SYNC();
-
-        for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS)
-        {
-            KeyValuePairT pair                                  = temp_storage.raw_exchange.Alias()[item];
-            d_unique_out[num_tile_segments_prefix + item]       = pair.key;
-            d_aggregates_out[num_tile_segments_prefix + item]   = pair.value;
-        }
-    }
-
-
-    /**
-     * Scatter flagged items
-     */
-    __device__ __forceinline__ void Scatter(
-        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
-        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
-        OffsetT         (&segment_indices)[ITEMS_PER_THREAD],
-        OffsetT         num_tile_segments,
-        OffsetT         num_tile_segments_prefix)
-    {
-        // Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one
-        if (TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS))
-        {
-            ScatterTwoPhase(
-                scatter_items,
-                segment_flags,
-                segment_indices,
-                num_tile_segments,
-                num_tile_segments_prefix);
-        }
-        else
-        {
-            ScatterDirect(
-                scatter_items,
-                segment_flags,
-                segment_indices);
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Cooperatively scan a device-wide sequence of tiles with other CTAs
-    //---------------------------------------------------------------------
-
-    /**
-     * Process a tile of input (dynamic chained scan)
-     */
-    template <bool IS_LAST_TILE>                ///< Whether the current tile is the last tile
-    __device__ __forceinline__ void ConsumeTile(
-        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
-        int                 tile_idx,           ///< Tile index
-        OffsetT             tile_offset,        ///< Tile offset
-        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
-    {
-        KeyOutputT          keys[ITEMS_PER_THREAD];             // Tile keys
-        KeyOutputT          prev_keys[ITEMS_PER_THREAD];        // Tile keys shuffled up
-        ValueOutputT        values[ITEMS_PER_THREAD];           // Tile values
-        OffsetT             head_flags[ITEMS_PER_THREAD];       // Segment head flags
-        OffsetT             segment_indices[ITEMS_PER_THREAD];  // Segment indices
-        OffsetValuePairT    scan_items[ITEMS_PER_THREAD];       // Zipped values and segment flags|indices
-        KeyValuePairT       scatter_items[ITEMS_PER_THREAD];    // Zipped key value pairs for scattering
-
-        // Load keys
-        if (IS_LAST_TILE)
-            BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys, num_remaining);
-        else
-            BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys);
-
-        // Load tile predecessor key in first thread
-        KeyOutputT tile_predecessor;
-        if (threadIdx.x == 0)
-        {
-            tile_predecessor = (tile_idx == 0) ?
-                keys[0] :                       // First tile gets repeat of first item (thus first item will not be flagged as a head)
-                d_keys_in[tile_offset - 1];     // Subsequent tiles get last key from previous tile
-        }
-
-        CTA_SYNC();
-
-        // Load values
-        if (IS_LAST_TILE)
-            BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values, num_remaining);
-        else
-            BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values);
-
-        CTA_SYNC();
-
-        // Initialize head-flags and shuffle up the previous keys
-        if (IS_LAST_TILE)
-        {
-            // Use custom flag operator to additionally flag the first out-of-bounds item
-            GuardedInequalityWrapper<EqualityOpT> flag_op(equality_op, num_remaining);
-            BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
-                head_flags, keys, prev_keys, flag_op, tile_predecessor);
-        }
-        else
-        {
-            InequalityWrapper<EqualityOpT> flag_op(equality_op);
-            BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
-                head_flags, keys, prev_keys, flag_op, tile_predecessor);
-        }
-
-        // Zip values and head flags
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            scan_items[ITEM].value  = values[ITEM];
-            scan_items[ITEM].key    = head_flags[ITEM];
-        }
-
-        // Perform exclusive tile scan
-        OffsetValuePairT    block_aggregate;        // Inclusive block-wide scan aggregate
-        OffsetT             num_segments_prefix;    // Number of segments prior to this tile
-        OffsetValuePairT    total_aggregate;        // The tile prefix folded with block_aggregate
-        if (tile_idx == 0)
-        {
-            // Scan first tile
-            BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, block_aggregate);
-            num_segments_prefix     = 0;
-            total_aggregate         = block_aggregate;
-
-            // Update tile status if there are successor tiles
-            if ((!IS_LAST_TILE) && (threadIdx.x == 0))
-                tile_state.SetInclusive(0, block_aggregate);
-        }
-        else
-        {
-            // Scan non-first tile
-            TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
-            BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, prefix_op);
-
-            block_aggregate         = prefix_op.GetBlockAggregate();
-            num_segments_prefix     = prefix_op.GetExclusivePrefix().key;
-            total_aggregate         = prefix_op.GetInclusivePrefix();
-        }
-
-        // Rezip scatter items and segment indices
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            scatter_items[ITEM].key     = prev_keys[ITEM];
-            scatter_items[ITEM].value   = scan_items[ITEM].value;
-            segment_indices[ITEM]       = scan_items[ITEM].key;
-        }
-
-        // At this point, each flagged segment head has:
-        //  - The key for the previous segment
-        //  - The reduced value from the previous segment
-        //  - The segment index for the reduced value
-
-        // Scatter flagged keys and values
-        OffsetT num_tile_segments = block_aggregate.key;
-        Scatter(scatter_items, head_flags, segment_indices, num_tile_segments, num_segments_prefix);
-
-        // Last thread in last tile will output final count (and last pair, if necessary)
-        if ((IS_LAST_TILE) && (threadIdx.x == BLOCK_THREADS - 1))
-        {
-            OffsetT num_segments = num_segments_prefix + num_tile_segments;
-
-            // If the last tile is a whole tile, output the final_value
-            if (num_remaining == TILE_ITEMS)
-            {
-                d_unique_out[num_segments]      = keys[ITEMS_PER_THREAD - 1];
-                d_aggregates_out[num_segments]  = total_aggregate.value;
-                num_segments++;
-            }
-
-            // Output the total number of items selected
-            *d_num_runs_out = num_segments;
-        }
-    }
-
-
-    /**
-     * Scan tiles of items as part of a dynamic chained scan
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        int                 num_items,          ///< Total number of input items
-        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
-        int                 start_tile)         ///< The starting tile for the current grid
-    {
-        // Blocks are launched in increasing order, so just assign one tile per block
-        int     tile_idx        = start_tile + blockIdx.x;          // Current tile index
-        OffsetT tile_offset     = OffsetT(TILE_ITEMS) * tile_idx;   // Global offset for the current tile
-        OffsetT num_remaining   = num_items - tile_offset;          // Remaining items (including this tile)
-
-        if (num_remaining > TILE_ITEMS)
-        {
-            // Not last tile
-            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state);
-        }
-        else if (num_remaining > 0)
-        {
-            // Last tile
-            ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state);
-        }
-    }
-
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/agent/agent_rle.cuh b/thrust/system/cuda/detail/cub/agent/agent_rle.cuh
deleted file mode 100644
index 94f47eb5b..000000000
--- a/thrust/system/cuda/detail/cub/agent/agent_rle.cuh
+++ /dev/null
@@ -1,837 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "single_pass_scan_operators.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_scan.cuh"
-#include "../block/block_exchange.cuh"
-#include "../block/block_discontinuity.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../iterator/constant_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for AgentRle
- */
-template <
-    int                         _BLOCK_THREADS,                 ///< Threads per thread block
-    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
-    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
-    bool                        _STORE_WARP_TIME_SLICING,       ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
-    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
-struct AgentRlePolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
-        STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING,     ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
-    };
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
-    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
-};
-
-
-
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode 
- */
-template <
-    typename    AgentRlePolicyT,        ///< Parameterized AgentRlePolicyT tuning policy type
-    typename    InputIteratorT,         ///< Random-access input iterator type for data
-    typename    OffsetsOutputIteratorT, ///< Random-access output iterator type for offset values
-    typename    LengthsOutputIteratorT, ///< Random-access output iterator type for length values
-    typename    EqualityOpT,            ///< T equality operator type
-    typename    OffsetT>                ///< Signed integer type for global offsets
-struct AgentRle
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type T;
-
-    /// The lengths output value type
-    typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),   // LengthT =  (if output iterator's value type is void) ?
-        OffsetT,                                                                                                    // ... then the OffsetT type,
-        typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                           // ... else the output iterator's value type
-
-    /// Tuple type for scanning (pairs run-length and run-index)
-    typedef KeyValuePair<OffsetT, LengthT> LengthOffsetPair;
-
-    /// Tile status descriptor interface type
-    typedef ReduceByKeyScanTileState<LengthT, OffsetT> ScanTileStateT;
-
-    // Constants
-    enum
-    {
-        WARP_THREADS            = CUB_WARP_THREADS(PTX_ARCH),
-        BLOCK_THREADS           = AgentRlePolicyT::BLOCK_THREADS,
-        ITEMS_PER_THREAD        = AgentRlePolicyT::ITEMS_PER_THREAD,
-        WARP_ITEMS              = WARP_THREADS * ITEMS_PER_THREAD,
-        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
-        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-
-        /// Whether or not to sync after loading data
-        SYNC_AFTER_LOAD         = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT),
-
-        /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
-        STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING,
-        ACTIVE_EXCHANGE_WARPS   = (STORE_WARP_TIME_SLICING) ? 1 : WARPS,
-    };
-
-
-    /**
-     * Special operator that signals all out-of-bounds items are not equal to everything else,
-     * forcing both (1) the last item to be tail-flagged and (2) all oob items to be marked
-     * trivial.
-     */
-    template <bool LAST_TILE>
-    struct OobInequalityOp
-    {
-        OffsetT         num_remaining;
-        EqualityOpT      equality_op;
-
-        __device__ __forceinline__ OobInequalityOp(
-            OffsetT     num_remaining,
-            EqualityOpT  equality_op)
-        :
-            num_remaining(num_remaining),
-            equality_op(equality_op)
-        {}
-
-        template <typename Index>
-        __host__ __device__ __forceinline__ bool operator()(T first, T second, Index idx)
-        {
-            if (!LAST_TILE || (idx < num_remaining))
-                return !equality_op(first, second);
-            else
-                return true;
-        }
-    };
-
-
-    // Cache-modified Input iterator wrapper type (for applying cache modifier) for data
-    typedef typename If<IsPointer<InputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentRlePolicyT::LOAD_MODIFIER, T, OffsetT>,      // Wrap the native input pointer with CacheModifiedVLengthnputIterator
-            InputIteratorT>::Type                                                       // Directly use the supplied input iterator type
-        WrappedInputIteratorT;
-
-    // Parameterized BlockLoad type for data
-    typedef BlockLoad<
-            T,
-            AgentRlePolicyT::BLOCK_THREADS,
-            AgentRlePolicyT::ITEMS_PER_THREAD,
-            AgentRlePolicyT::LOAD_ALGORITHM>
-        BlockLoadT;
-
-    // Parameterized BlockDiscontinuity type for data
-    typedef BlockDiscontinuity<T, BLOCK_THREADS> BlockDiscontinuityT;
-
-    // Parameterized WarpScan type
-    typedef WarpScan<LengthOffsetPair> WarpScanPairs;
-
-    // Reduce-length-by-run scan operator
-    typedef ReduceBySegmentOp<cub::Sum> ReduceBySegmentOpT;
-
-    // Callback type for obtaining tile prefix during block scan
-    typedef TilePrefixCallbackOp<
-            LengthOffsetPair,
-            ReduceBySegmentOpT,
-            ScanTileStateT>
-        TilePrefixCallbackOpT;
-
-    // Warp exchange types
-    typedef WarpExchange<LengthOffsetPair, ITEMS_PER_THREAD>        WarpExchangePairs;
-
-    typedef typename If<STORE_WARP_TIME_SLICING, typename WarpExchangePairs::TempStorage, NullType>::Type WarpExchangePairsStorage;
-
-    typedef WarpExchange<OffsetT, ITEMS_PER_THREAD>                 WarpExchangeOffsets;
-    typedef WarpExchange<LengthT, ITEMS_PER_THREAD>                 WarpExchangeLengths;
-
-    typedef LengthOffsetPair WarpAggregates[WARPS];
-
-    // Shared memory type for this thread block
-    struct _TempStorage
-    {
-        // Aliasable storage layout
-        union Aliasable
-        {
-            struct
-            {
-                typename BlockDiscontinuityT::TempStorage       discontinuity;              // Smem needed for discontinuity detection
-                typename WarpScanPairs::TempStorage             warp_scan[WARPS];           // Smem needed for warp-synchronous scans
-                Uninitialized<LengthOffsetPair[WARPS]>          warp_aggregates;            // Smem needed for sharing warp-wide aggregates
-                typename TilePrefixCallbackOpT::TempStorage     prefix;                     // Smem needed for cooperative prefix callback
-            };
-
-            // Smem needed for input loading
-            typename BlockLoadT::TempStorage                    load;
-
-            // Aliasable layout needed for two-phase scatter
-            union ScatterAliasable
-            {
-                unsigned long long                              align;
-                WarpExchangePairsStorage                        exchange_pairs[ACTIVE_EXCHANGE_WARPS];
-                typename WarpExchangeOffsets::TempStorage       exchange_offsets[ACTIVE_EXCHANGE_WARPS];
-                typename WarpExchangeLengths::TempStorage       exchange_lengths[ACTIVE_EXCHANGE_WARPS];
-
-            } scatter_aliasable;
-
-        } aliasable;
-
-        OffsetT             tile_idx;                   // Shared tile index
-        LengthOffsetPair    tile_inclusive;             // Inclusive tile prefix
-        LengthOffsetPair    tile_exclusive;             // Exclusive tile prefix
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
-
-    WrappedInputIteratorT           d_in;               ///< Pointer to input sequence of data items
-    OffsetsOutputIteratorT          d_offsets_out;      ///< Input run offsets
-    LengthsOutputIteratorT          d_lengths_out;      ///< Output run lengths
-
-    EqualityOpT                     equality_op;        ///< T equality operator
-    ReduceBySegmentOpT              scan_op;            ///< Reduce-length-by-flag scan operator
-    OffsetT                         num_items;          ///< Total number of input items
-
-
-    //---------------------------------------------------------------------
-    // Constructor
-    //---------------------------------------------------------------------
-
-    // Constructor
-    __device__ __forceinline__
-    AgentRle(
-        TempStorage                 &temp_storage,      ///< [in] Reference to temp_storage
-        InputIteratorT              d_in,               ///< [in] Pointer to input sequence of data items
-        OffsetsOutputIteratorT      d_offsets_out,      ///< [out] Pointer to output sequence of run offsets
-        LengthsOutputIteratorT      d_lengths_out,      ///< [out] Pointer to output sequence of run lengths
-        EqualityOpT                 equality_op,        ///< [in] T equality operator
-        OffsetT                     num_items)          ///< [in] Total number of input items
-    :
-        temp_storage(temp_storage.Alias()),
-        d_in(d_in),
-        d_offsets_out(d_offsets_out),
-        d_lengths_out(d_lengths_out),
-        equality_op(equality_op),
-        scan_op(cub::Sum()),
-        num_items(num_items)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Utility methods for initializing the selections
-    //---------------------------------------------------------------------
-
-    template <bool FIRST_TILE, bool LAST_TILE>
-    __device__ __forceinline__ void InitializeSelections(
-        OffsetT             tile_offset,
-        OffsetT             num_remaining,
-        T                   (&items)[ITEMS_PER_THREAD],
-        LengthOffsetPair    (&lengths_and_num_runs)[ITEMS_PER_THREAD])
-    {
-        bool                head_flags[ITEMS_PER_THREAD];
-        bool                tail_flags[ITEMS_PER_THREAD];
-
-        OobInequalityOp<LAST_TILE> inequality_op(num_remaining, equality_op);
-
-        if (FIRST_TILE && LAST_TILE)
-        {
-            // First-and-last-tile always head-flags the first item and tail-flags the last item
-
-            BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
-                head_flags, tail_flags, items, inequality_op);
-        }
-        else if (FIRST_TILE)
-        {
-            // First-tile always head-flags the first item
-
-            // Get the first item from the next tile
-            T tile_successor_item;
-            if (threadIdx.x == BLOCK_THREADS - 1)
-                tile_successor_item = d_in[tile_offset + TILE_ITEMS];
-
-            BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
-                head_flags, tail_flags, tile_successor_item, items, inequality_op);
-        }
-        else if (LAST_TILE)
-        {
-            // Last-tile always flags the last item
-
-            // Get the last item from the previous tile
-            T tile_predecessor_item;
-            if (threadIdx.x == 0)
-                tile_predecessor_item = d_in[tile_offset - 1];
-
-            BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
-                head_flags, tile_predecessor_item, tail_flags, items, inequality_op);
-        }
-        else
-        {
-            // Get the first item from the next tile
-            T tile_successor_item;
-            if (threadIdx.x == BLOCK_THREADS - 1)
-                tile_successor_item = d_in[tile_offset + TILE_ITEMS];
-
-            // Get the last item from the previous tile
-            T tile_predecessor_item;
-            if (threadIdx.x == 0)
-                tile_predecessor_item = d_in[tile_offset - 1];
-
-            BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
-                head_flags, tile_predecessor_item, tail_flags, tile_successor_item, items, inequality_op);
-        }
-
-        // Zip counts and runs
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            lengths_and_num_runs[ITEM].key      = head_flags[ITEM] && (!tail_flags[ITEM]);
-            lengths_and_num_runs[ITEM].value    = ((!head_flags[ITEM]) || (!tail_flags[ITEM]));
-        }
-    }
-
-    //---------------------------------------------------------------------
-    // Scan utility methods
-    //---------------------------------------------------------------------
-
-    /**
-     * Scan of allocations
-     */
-    __device__ __forceinline__ void WarpScanAllocations(
-        LengthOffsetPair    &tile_aggregate,
-        LengthOffsetPair    &warp_aggregate,
-        LengthOffsetPair    &warp_exclusive_in_tile,
-        LengthOffsetPair    &thread_exclusive_in_warp,
-        LengthOffsetPair    (&lengths_and_num_runs)[ITEMS_PER_THREAD])
-    {
-        // Perform warpscans
-        unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
-        int lane_id = LaneId();
-
-        LengthOffsetPair identity;
-        identity.key = 0;
-        identity.value = 0;
-
-        LengthOffsetPair thread_inclusive;
-        LengthOffsetPair thread_aggregate = internal::ThreadReduce(lengths_and_num_runs, scan_op);
-        WarpScanPairs(temp_storage.aliasable.warp_scan[warp_id]).Scan(
-            thread_aggregate,
-            thread_inclusive,
-            thread_exclusive_in_warp,
-            identity,
-            scan_op);
-
-        // Last lane in each warp shares its warp-aggregate
-        if (lane_id == WARP_THREADS - 1)
-            temp_storage.aliasable.warp_aggregates.Alias()[warp_id] = thread_inclusive;
-
-        CTA_SYNC();
-
-        // Accumulate total selected and the warp-wide prefix
-        warp_exclusive_in_tile          = identity;
-        warp_aggregate                  = temp_storage.aliasable.warp_aggregates.Alias()[warp_id];
-        tile_aggregate                  = temp_storage.aliasable.warp_aggregates.Alias()[0];
-
-        #pragma unroll
-        for (int WARP = 1; WARP < WARPS; ++WARP)
-        {
-            if (warp_id == WARP)
-                warp_exclusive_in_tile = tile_aggregate;
-
-            tile_aggregate = scan_op(tile_aggregate, temp_storage.aliasable.warp_aggregates.Alias()[WARP]);
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Utility methods for scattering selections
-    //---------------------------------------------------------------------
-
-    /**
-     * Two-phase scatter, specialized for warp time-slicing
-     */
-    template <bool FIRST_TILE>
-    __device__ __forceinline__ void ScatterTwoPhase(
-        OffsetT             tile_num_runs_exclusive_in_global,
-        OffsetT             warp_num_runs_aggregate,
-        OffsetT             warp_num_runs_exclusive_in_tile,
-        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
-        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD],
-        Int2Type<true>      is_warp_time_slice)
-    {
-        unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
-        int lane_id = LaneId();
-
-        // Locally compact items within the warp (first warp)
-        if (warp_id == 0)
-        {
-            WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped(
-                lengths_and_offsets, thread_num_runs_exclusive_in_warp);
-        }
-
-        // Locally compact items within the warp (remaining warps)
-        #pragma unroll
-        for (int SLICE = 1; SLICE < WARPS; ++SLICE)
-        {
-            CTA_SYNC();
-
-            if (warp_id == SLICE)
-            {
-                WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped(
-                    lengths_and_offsets, thread_num_runs_exclusive_in_warp);
-            }
-        }
-
-        // Global scatter
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            if ((ITEM * WARP_THREADS) < warp_num_runs_aggregate - lane_id)
-            {
-                OffsetT item_offset =
-                    tile_num_runs_exclusive_in_global +
-                    warp_num_runs_exclusive_in_tile +
-                    (ITEM * WARP_THREADS) + lane_id;
-
-                // Scatter offset
-                d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;
-
-                // Scatter length if not the first (global) length
-                if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
-                {
-                    d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Two-phase scatter
-     */
-    template <bool FIRST_TILE>
-    __device__ __forceinline__ void ScatterTwoPhase(
-        OffsetT             tile_num_runs_exclusive_in_global,
-        OffsetT             warp_num_runs_aggregate,
-        OffsetT             warp_num_runs_exclusive_in_tile,
-        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
-        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD],
-        Int2Type<false>     is_warp_time_slice)
-    {
-        unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
-        int lane_id = LaneId();
-
-        // Unzip
-        OffsetT run_offsets[ITEMS_PER_THREAD];
-        LengthT run_lengths[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            run_offsets[ITEM] = lengths_and_offsets[ITEM].key;
-            run_lengths[ITEM] = lengths_and_offsets[ITEM].value;
-        }
-
-        WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]).ScatterToStriped(
-            run_offsets, thread_num_runs_exclusive_in_warp);
-
-        WARP_SYNC(0xffffffff);
-
-        WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]).ScatterToStriped(
-            run_lengths, thread_num_runs_exclusive_in_warp);
-
-        // Global scatter
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            if ((ITEM * WARP_THREADS) + lane_id < warp_num_runs_aggregate)
-            {
-                OffsetT item_offset =
-                    tile_num_runs_exclusive_in_global +
-                    warp_num_runs_exclusive_in_tile +
-                    (ITEM * WARP_THREADS) + lane_id;
-
-                // Scatter offset
-                d_offsets_out[item_offset] = run_offsets[ITEM];
-
-                // Scatter length if not the first (global) length
-                if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
-                {
-                    d_lengths_out[item_offset - 1] = run_lengths[ITEM];
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Direct scatter
-     */
-    template <bool FIRST_TILE>
-    __device__ __forceinline__ void ScatterDirect(
-        OffsetT             tile_num_runs_exclusive_in_global,
-        OffsetT             warp_num_runs_aggregate,
-        OffsetT             warp_num_runs_exclusive_in_tile,
-        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
-        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD])
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            if (thread_num_runs_exclusive_in_warp[ITEM] < warp_num_runs_aggregate)
-            {
-                OffsetT item_offset =
-                    tile_num_runs_exclusive_in_global +
-                    warp_num_runs_exclusive_in_tile +
-                    thread_num_runs_exclusive_in_warp[ITEM];
-
-                // Scatter offset
-                d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;
-
-                // Scatter length if not the first (global) length
-                if (item_offset >= 1)
-                {
-                    d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Scatter
-     */
-    template <bool FIRST_TILE>
-    __device__ __forceinline__ void Scatter(
-        OffsetT             tile_num_runs_aggregate,
-        OffsetT             tile_num_runs_exclusive_in_global,
-        OffsetT             warp_num_runs_aggregate,
-        OffsetT             warp_num_runs_exclusive_in_tile,
-        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
-        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD])
-    {
-        if ((ITEMS_PER_THREAD == 1) || (tile_num_runs_aggregate < BLOCK_THREADS))
-        {
-            // Direct scatter if the warp has any items
-            if (warp_num_runs_aggregate)
-            {
-                ScatterDirect<FIRST_TILE>(
-                    tile_num_runs_exclusive_in_global,
-                    warp_num_runs_aggregate,
-                    warp_num_runs_exclusive_in_tile,
-                    thread_num_runs_exclusive_in_warp,
-                    lengths_and_offsets);
-            }
-        }
-        else
-        {
-            // Scatter two phase
-            ScatterTwoPhase<FIRST_TILE>(
-                tile_num_runs_exclusive_in_global,
-                warp_num_runs_aggregate,
-                warp_num_runs_exclusive_in_tile,
-                thread_num_runs_exclusive_in_warp,
-                lengths_and_offsets,
-                Int2Type<STORE_WARP_TIME_SLICING>());
-        }
-    }
-
-
-
-    //---------------------------------------------------------------------
-    // Cooperatively scan a device-wide sequence of tiles with other CTAs
-    //---------------------------------------------------------------------
-
-    /**
-     * Process a tile of input (dynamic chained scan)
-     */
-    template <
-        bool                LAST_TILE>
-    __device__ __forceinline__ LengthOffsetPair ConsumeTile(
-        OffsetT             num_items,          ///< Total number of global input items
-        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
-        int                 tile_idx,           ///< Tile index
-        OffsetT             tile_offset,        ///< Tile offset
-        ScanTileStateT      &tile_status)       ///< Global list of tile status
-    {
-        if (tile_idx == 0)
-        {
-            // First tile
-
-            // Load items
-            T items[ITEMS_PER_THREAD];
-            if (LAST_TILE)
-                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T());
-            else
-                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items);
-
-            if (SYNC_AFTER_LOAD)
-                CTA_SYNC();
-
-            // Set flags
-            LengthOffsetPair    lengths_and_num_runs[ITEMS_PER_THREAD];
-
-            InitializeSelections<true, LAST_TILE>(
-                tile_offset,
-                num_remaining,
-                items,
-                lengths_and_num_runs);
-
-            // Exclusive scan of lengths and runs
-            LengthOffsetPair tile_aggregate;
-            LengthOffsetPair warp_aggregate;
-            LengthOffsetPair warp_exclusive_in_tile;
-            LengthOffsetPair thread_exclusive_in_warp;
-
-            WarpScanAllocations(
-                tile_aggregate,
-                warp_aggregate,
-                warp_exclusive_in_tile,
-                thread_exclusive_in_warp,
-                lengths_and_num_runs);
-
-            // Update tile status if this is not the last tile
-            if (!LAST_TILE && (threadIdx.x == 0))
-                tile_status.SetInclusive(0, tile_aggregate);
-
-            // Update thread_exclusive_in_warp to fold in warp run-length
-            if (thread_exclusive_in_warp.key == 0)
-                thread_exclusive_in_warp.value += warp_exclusive_in_tile.value;
-
-            LengthOffsetPair    lengths_and_offsets[ITEMS_PER_THREAD];
-            OffsetT             thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];
-            LengthOffsetPair    lengths_and_num_runs2[ITEMS_PER_THREAD];
-
-            // Downsweep scan through lengths_and_num_runs
-            internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);
-
-            // Zip
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                lengths_and_offsets[ITEM].value         = lengths_and_num_runs2[ITEM].value;
-                lengths_and_offsets[ITEM].key        = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
-                thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
-                                                                lengths_and_num_runs2[ITEM].key :         // keep
-                                                                WARP_THREADS * ITEMS_PER_THREAD;            // discard
-            }
-
-            OffsetT tile_num_runs_aggregate              = tile_aggregate.key;
-            OffsetT tile_num_runs_exclusive_in_global    = 0;
-            OffsetT warp_num_runs_aggregate              = warp_aggregate.key;
-            OffsetT warp_num_runs_exclusive_in_tile      = warp_exclusive_in_tile.key;
-
-            // Scatter
-            Scatter<true>(
-                tile_num_runs_aggregate,
-                tile_num_runs_exclusive_in_global,
-                warp_num_runs_aggregate,
-                warp_num_runs_exclusive_in_tile,
-                thread_num_runs_exclusive_in_warp,
-                lengths_and_offsets);
-
-            // Return running total (inclusive of this tile)
-            return tile_aggregate;
-        }
-        else
-        {
-            // Not first tile
-
-            // Load items
-            T items[ITEMS_PER_THREAD];
-            if (LAST_TILE)
-                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T());
-            else
-                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items);
-
-            if (SYNC_AFTER_LOAD)
-                CTA_SYNC();
-
-            // Set flags
-            LengthOffsetPair    lengths_and_num_runs[ITEMS_PER_THREAD];
-
-            InitializeSelections<false, LAST_TILE>(
-                tile_offset,
-                num_remaining,
-                items,
-                lengths_and_num_runs);
-
-            // Exclusive scan of lengths and runs
-            LengthOffsetPair tile_aggregate;
-            LengthOffsetPair warp_aggregate;
-            LengthOffsetPair warp_exclusive_in_tile;
-            LengthOffsetPair thread_exclusive_in_warp;
-
-            WarpScanAllocations(
-                tile_aggregate,
-                warp_aggregate,
-                warp_exclusive_in_tile,
-                thread_exclusive_in_warp,
-                lengths_and_num_runs);
-
-            // First warp computes tile prefix in lane 0
-            TilePrefixCallbackOpT prefix_op(tile_status, temp_storage.aliasable.prefix, Sum(), tile_idx);
-            unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
-            if (warp_id == 0)
-            {
-                prefix_op(tile_aggregate);
-                if (threadIdx.x == 0)
-                    temp_storage.tile_exclusive = prefix_op.exclusive_prefix;
-            }
-
-            CTA_SYNC();
-
-            LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive;
-
-            // Update thread_exclusive_in_warp to fold in warp and tile run-lengths
-            LengthOffsetPair thread_exclusive = scan_op(tile_exclusive_in_global, warp_exclusive_in_tile);
-            if (thread_exclusive_in_warp.key == 0)
-                thread_exclusive_in_warp.value += thread_exclusive.value;
-
-            // Downsweep scan through lengths_and_num_runs
-            LengthOffsetPair    lengths_and_num_runs2[ITEMS_PER_THREAD];
-            LengthOffsetPair    lengths_and_offsets[ITEMS_PER_THREAD];
-            OffsetT             thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];
-
-            internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);
-
-            // Zip
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                lengths_and_offsets[ITEM].value         = lengths_and_num_runs2[ITEM].value;
-                lengths_and_offsets[ITEM].key        = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
-                thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
-                                                                lengths_and_num_runs2[ITEM].key :         // keep
-                                                                WARP_THREADS * ITEMS_PER_THREAD;            // discard
-            }
-
-            OffsetT tile_num_runs_aggregate              = tile_aggregate.key;
-            OffsetT tile_num_runs_exclusive_in_global    = tile_exclusive_in_global.key;
-            OffsetT warp_num_runs_aggregate              = warp_aggregate.key;
-            OffsetT warp_num_runs_exclusive_in_tile      = warp_exclusive_in_tile.key;
-
-            // Scatter
-            Scatter<false>(
-                tile_num_runs_aggregate,
-                tile_num_runs_exclusive_in_global,
-                warp_num_runs_aggregate,
-                warp_num_runs_exclusive_in_tile,
-                thread_num_runs_exclusive_in_warp,
-                lengths_and_offsets);
-
-            // Return running total (inclusive of this tile)
-            return prefix_op.inclusive_prefix;
-        }
-    }
-
-
-    /**
-     * Scan tiles of items as part of a dynamic chained scan
-     */
-    template <typename NumRunsIteratorT>            ///< Output iterator type for recording number of items selected
-    __device__ __forceinline__ void ConsumeRange(
-        int                 num_tiles,              ///< Total number of input tiles
-        ScanTileStateT&     tile_status,            ///< Global list of tile status
-        NumRunsIteratorT    d_num_runs_out)         ///< Output pointer for total number of runs identified
-    {
-        // Blocks are launched in increasing order, so just assign one tile per block
-        int     tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
-        OffsetT tile_offset     = tile_idx * TILE_ITEMS;                  // Global offset for the current tile
-        OffsetT num_remaining   = num_items - tile_offset;                  // Remaining items (including this tile)
-
-        if (tile_idx < num_tiles - 1)
-        {
-            // Not the last tile (full)
-            ConsumeTile<false>(num_items, num_remaining, tile_idx, tile_offset, tile_status);
-        }
-        else if (num_remaining > 0)
-        {
-            // The last tile (possibly partially-full)
-            LengthOffsetPair running_total = ConsumeTile<true>(num_items, num_remaining, tile_idx, tile_offset, tile_status);
-
-            if (threadIdx.x == 0)
-            {
-                // Output the total number of items selected
-                *d_num_runs_out = running_total.key;
-
-                // The inclusive prefix contains accumulated length reduction for the last run
-                if (running_total.key > 0)
-                    d_lengths_out[running_total.key - 1] = running_total.value;
-            }
-        }
-    }
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/agent/agent_scan.cuh b/thrust/system/cuda/detail/cub/agent/agent_scan.cuh
deleted file mode 100644
index bd35b6932..000000000
--- a/thrust/system/cuda/detail/cub/agent/agent_scan.cuh
+++ /dev/null
@@ -1,471 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan .
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "single_pass_scan_operators.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_scan.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for AgentScan
- */
-template <
-    int                         _BLOCK_THREADS,                 ///< Threads per thread block
-    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
-    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
-    BlockStoreAlgorithm         _STORE_ALGORITHM,               ///< The BlockStore algorithm to use
-    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
-struct AgentScanPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
-    };
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;          ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;           ///< Cache load modifier for reading input elements
-    static const BlockStoreAlgorithm    STORE_ALGORITHM         = _STORE_ALGORITHM;         ///< The BlockStore algorithm to use
-    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;          ///< The BlockScan algorithm to use
-};
-
-
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan .
- */
-template <
-    typename AgentScanPolicyT,      ///< Parameterized AgentScanPolicyT tuning policy type
-    typename InputIteratorT,        ///< Random-access input iterator type
-    typename OutputIteratorT,       ///< Random-access output iterator type
-    typename ScanOpT,               ///< Scan functor type
-    typename InitValueT,            ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan)
-    typename OffsetT>               ///< Signed integer type for global offsets
-struct AgentScan
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    // Tile status descriptor interface type
-    typedef ScanTileState<OutputT> ScanTileStateT;
-
-    // Input iterator wrapper type (for applying cache modifier)
-    typedef typename If<IsPointer<InputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentScanPolicyT::LOAD_MODIFIER, InputT, OffsetT>,   // Wrap the native input pointer with CacheModifiedInputIterator
-            InputIteratorT>::Type                                                           // Directly use the supplied input iterator type
-        WrappedInputIteratorT;
-
-    // Constants
-    enum
-    {
-        IS_INCLUSIVE        = Equals<InitValueT, NullType>::VALUE,            // Inclusive scan if no init_value type is provided
-        BLOCK_THREADS       = AgentScanPolicyT::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = AgentScanPolicyT::ITEMS_PER_THREAD,
-        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
-    };
-
-    // Parameterized BlockLoad type
-    typedef BlockLoad<
-            OutputT,
-            AgentScanPolicyT::BLOCK_THREADS,
-            AgentScanPolicyT::ITEMS_PER_THREAD,
-            AgentScanPolicyT::LOAD_ALGORITHM>
-        BlockLoadT;
-
-    // Parameterized BlockStore type
-    typedef BlockStore<
-            OutputT,
-            AgentScanPolicyT::BLOCK_THREADS,
-            AgentScanPolicyT::ITEMS_PER_THREAD,
-            AgentScanPolicyT::STORE_ALGORITHM>
-        BlockStoreT;
-
-    // Parameterized BlockScan type
-    typedef BlockScan<
-            OutputT,
-            AgentScanPolicyT::BLOCK_THREADS,
-            AgentScanPolicyT::SCAN_ALGORITHM>
-        BlockScanT;
-
-    // Callback type for obtaining tile prefix during block scan
-    typedef TilePrefixCallbackOp<
-            OutputT,
-            ScanOpT,
-            ScanTileStateT>
-        TilePrefixCallbackOpT;
-
-    // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles
-    typedef BlockScanRunningPrefixOp<
-            OutputT,
-            ScanOpT>
-        RunningPrefixCallbackOp;
-
-    // Shared memory type for this thread block
-    union _TempStorage
-    {
-        typename BlockLoadT::TempStorage    load;       // Smem needed for tile loading
-        typename BlockStoreT::TempStorage   store;      // Smem needed for tile storing
-
-        struct
-        {
-            typename TilePrefixCallbackOpT::TempStorage  prefix;     // Smem needed for cooperative prefix callback
-            typename BlockScanT::TempStorage             scan;       // Smem needed for tile scanning
-        };
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage&               temp_storage;       ///< Reference to temp_storage
-    WrappedInputIteratorT       d_in;               ///< Input data
-    OutputIteratorT             d_out;              ///< Output data
-    ScanOpT                     scan_op;            ///< Binary scan operator
-    InitValueT                  init_value;         ///< The init_value element for ScanOpT
-
-
-    //---------------------------------------------------------------------
-    // Block scan utility methods
-    //---------------------------------------------------------------------
-
-    /**
-     * Exclusive scan specialization (first tile)
-     */
-    __device__ __forceinline__
-    void ScanTile(
-        OutputT             (&items)[ITEMS_PER_THREAD],
-        OutputT             init_value,
-        ScanOpT             scan_op,
-        OutputT             &block_aggregate,
-        Int2Type<false>     /*is_inclusive*/)
-    {
-        BlockScanT(temp_storage.scan).ExclusiveScan(items, items, init_value, scan_op, block_aggregate);
-        block_aggregate = scan_op(init_value, block_aggregate);
-    }
-
-
-    /**
-     * Inclusive scan specialization (first tile)
-     */
-    __device__ __forceinline__
-    void ScanTile(
-        OutputT             (&items)[ITEMS_PER_THREAD],
-        InitValueT          /*init_value*/,
-        ScanOpT             scan_op,
-        OutputT             &block_aggregate,
-        Int2Type<true>      /*is_inclusive*/)
-    {
-        BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate);
-    }
-
-
-    /**
-     * Exclusive scan specialization (subsequent tiles)
-     */
-    template <typename PrefixCallback>
-    __device__ __forceinline__
-    void ScanTile(
-        OutputT             (&items)[ITEMS_PER_THREAD],
-        ScanOpT             scan_op,
-        PrefixCallback      &prefix_op,
-        Int2Type<false>     /*is_inclusive*/)
-    {
-        BlockScanT(temp_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op);
-    }
-
-
-    /**
-     * Inclusive scan specialization (subsequent tiles)
-     */
-    template <typename PrefixCallback>
-    __device__ __forceinline__
-    void ScanTile(
-        OutputT             (&items)[ITEMS_PER_THREAD],
-        ScanOpT             scan_op,
-        PrefixCallback      &prefix_op,
-        Int2Type<true>      /*is_inclusive*/)
-    {
-        BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, prefix_op);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Constructor
-    //---------------------------------------------------------------------
-
-    // Constructor
-    __device__ __forceinline__
-    AgentScan(
-        TempStorage&    temp_storage,       ///< Reference to temp_storage
-        InputIteratorT  d_in,               ///< Input data
-        OutputIteratorT d_out,              ///< Output data
-        ScanOpT         scan_op,            ///< Binary scan operator
-        InitValueT      init_value)         ///< Initial value to seed the exclusive scan
-    :
-        temp_storage(temp_storage.Alias()),
-        d_in(d_in),
-        d_out(d_out),
-        scan_op(scan_op),
-        init_value(init_value)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Cooperatively scan a device-wide sequence of tiles with other CTAs
-    //---------------------------------------------------------------------
-
-    /**
-     * Process a tile of input (dynamic chained scan)
-     */
-    template <bool IS_LAST_TILE>                ///< Whether the current tile is the last tile
-    __device__ __forceinline__ void ConsumeTile(
-        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
-        int                 tile_idx,           ///< Tile index
-        OffsetT             tile_offset,        ///< Tile offset
-        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
-    {
-        // Load items
-        OutputT items[ITEMS_PER_THREAD];
-
-        if (IS_LAST_TILE)
-            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining);
-        else
-            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items);
-
-        CTA_SYNC();
-
-        // Perform tile scan
-        if (tile_idx == 0)
-        {
-            // Scan first tile
-            OutputT block_aggregate;
-            ScanTile(items, init_value, scan_op, block_aggregate, Int2Type<IS_INCLUSIVE>());
-            if ((!IS_LAST_TILE) && (threadIdx.x == 0))
-                tile_state.SetInclusive(0, block_aggregate);
-        }
-        else
-        {
-            // Scan non-first tile
-            TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
-            ScanTile(items, scan_op, prefix_op, Int2Type<IS_INCLUSIVE>());
-        }
-
-        CTA_SYNC();
-
-        // Store items
-        if (IS_LAST_TILE)
-            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, num_remaining);
-        else
-            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items);
-    }
-
-
-    /**
-     * Scan tiles of items as part of a dynamic chained scan
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        int                 num_items,          ///< Total number of input items
-        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
-        int                 start_tile)         ///< The starting tile for the current grid
-    {
-        // Blocks are launched in increasing order, so just assign one tile per block
-        int     tile_idx        = start_tile + blockIdx.x;          // Current tile index
-        OffsetT tile_offset     = OffsetT(TILE_ITEMS) * tile_idx;   // Global offset for the current tile
-        OffsetT num_remaining   = num_items - tile_offset;          // Remaining items (including this tile)
-
-        if (num_remaining > TILE_ITEMS)
-        {
-            // Not last tile
-            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state);
-        }
-        else if (num_remaining > 0)
-        {
-            // Last tile
-            ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state);
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Scan an sequence of consecutive tiles (independent of other thread blocks)
-    //---------------------------------------------------------------------
-
-    /**
-     * Process a tile of input
-     */
-    template <
-        bool                        IS_FIRST_TILE,
-        bool                        IS_LAST_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        OffsetT                     tile_offset,                ///< Tile offset
-        RunningPrefixCallbackOp&    prefix_op,                  ///< Running prefix operator
-        int                         valid_items = TILE_ITEMS)   ///< Number of valid items in the tile
-    {
-        // Load items
-        OutputT items[ITEMS_PER_THREAD];
-
-        if (IS_LAST_TILE)
-            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, valid_items);
-        else
-            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items);
-
-        CTA_SYNC();
-
-        // Block scan
-        if (IS_FIRST_TILE)
-        {
-            OutputT block_aggregate;
-            ScanTile(items, init_value, scan_op, block_aggregate, Int2Type<IS_INCLUSIVE>());
-            prefix_op.running_total = block_aggregate;
-        }
-        else
-        {
-            ScanTile(items, scan_op, prefix_op, Int2Type<IS_INCLUSIVE>());
-        }
-
-        CTA_SYNC();
-
-        // Store items
-        if (IS_LAST_TILE)
-            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, valid_items);
-        else
-            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items);
-    }
-
-
-    /**
-     * Scan a consecutive share of input tiles
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        OffsetT  range_offset,      ///< [in] Threadblock begin offset (inclusive)
-        OffsetT  range_end)         ///< [in] Threadblock end offset (exclusive)
-    {
-        BlockScanRunningPrefixOp<OutputT, ScanOpT> prefix_op(scan_op);
-
-        if (range_offset + TILE_ITEMS <= range_end)
-        {
-            // Consume first tile of input (full)
-            ConsumeTile<true, true>(range_offset, prefix_op);
-            range_offset += TILE_ITEMS;
-
-            // Consume subsequent full tiles of input
-            while (range_offset + TILE_ITEMS <= range_end)
-            {
-                ConsumeTile<false, true>(range_offset, prefix_op);
-                range_offset += TILE_ITEMS;
-            }
-
-            // Consume a partially-full tile
-            if (range_offset < range_end)
-            {
-                int valid_items = range_end - range_offset;
-                ConsumeTile<false, false>(range_offset, prefix_op, valid_items);
-            }
-        }
-        else
-        {
-            // Consume the first tile of input (partially-full)
-            int valid_items = range_end - range_offset;
-            ConsumeTile<true, false>(range_offset, prefix_op, valid_items);
-        }
-    }
-
-
-    /**
-     * Scan a consecutive share of input tiles, seeded with the specified prefix value
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        OffsetT range_offset,                       ///< [in] Threadblock begin offset (inclusive)
-        OffsetT range_end,                          ///< [in] Threadblock end offset (exclusive)
-        OutputT prefix)                             ///< [in] The prefix to apply to the scan segment
-    {
-        BlockScanRunningPrefixOp<OutputT, ScanOpT> prefix_op(prefix, scan_op);
-
-        // Consume full tiles of input
-        while (range_offset + TILE_ITEMS <= range_end)
-        {
-            ConsumeTile<true, false>(range_offset, prefix_op);
-            range_offset += TILE_ITEMS;
-        }
-
-        // Consume a partially-full tile
-        if (range_offset < range_end)
-        {
-            int valid_items = range_end - range_offset;
-            ConsumeTile<false, false>(range_offset, prefix_op, valid_items);
-        }
-    }
-
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh b/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh
deleted file mode 100644
index dd5359b96..000000000
--- a/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh
+++ /dev/null
@@ -1,375 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "single_pass_scan_operators.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_scan.cuh"
-#include "../block/block_discontinuity.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../iterator/constant_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for AgentSegmentFixup
- */
-template <
-    int                         _BLOCK_THREADS,                 ///< Threads per thread block
-    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
-    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
-    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
-struct AgentSegmentFixupPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
-    };
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
-    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
-};
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key
- */
-template <
-    typename    AgentSegmentFixupPolicyT,       ///< Parameterized AgentSegmentFixupPolicy tuning policy type
-    typename    PairsInputIteratorT,            ///< Random-access input iterator type for keys
-    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
-    typename    EqualityOpT,                    ///< KeyT equality operator type
-    typename    ReductionOpT,                   ///< ValueT reduction operator type
-    typename    OffsetT>                        ///< Signed integer type for global offsets
-struct AgentSegmentFixup
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Data type of key-value input iterator
-    typedef typename std::iterator_traits<PairsInputIteratorT>::value_type KeyValuePairT;
-
-    // Value type
-    typedef typename KeyValuePairT::Value ValueT;
-
-    // Tile status descriptor interface type
-    typedef ReduceByKeyScanTileState<ValueT, OffsetT> ScanTileStateT;
-
-    // Constants
-    enum
-    {
-        BLOCK_THREADS       = AgentSegmentFixupPolicyT::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = AgentSegmentFixupPolicyT::ITEMS_PER_THREAD,
-        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
-
-        // Whether or not do fixup using RLE + global atomics
-        USE_ATOMIC_FIXUP    = (CUB_PTX_ARCH >= 350) && 
-                                (Equals<ValueT, float>::VALUE || 
-                                 Equals<ValueT, int>::VALUE ||
-                                 Equals<ValueT, unsigned int>::VALUE ||
-                                 Equals<ValueT, unsigned long long>::VALUE),
-
-        // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type)
-        HAS_IDENTITY_ZERO   = (Equals<ReductionOpT, cub::Sum>::VALUE) && (Traits<ValueT>::PRIMITIVE),
-    };
-
-    // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys
-    typedef typename If<IsPointer<PairsInputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentSegmentFixupPolicyT::LOAD_MODIFIER, KeyValuePairT, OffsetT>,    // Wrap the native input pointer with CacheModifiedValuesInputIterator
-            PairsInputIteratorT>::Type                                                                      // Directly use the supplied input iterator type
-        WrappedPairsInputIteratorT;
-
-    // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values
-    typedef typename If<IsPointer<AggregatesOutputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentSegmentFixupPolicyT::LOAD_MODIFIER, ValueT, OffsetT>,    // Wrap the native input pointer with CacheModifiedValuesInputIterator
-            AggregatesOutputIteratorT>::Type                                                        // Directly use the supplied input iterator type
-        WrappedFixupInputIteratorT;
-
-    // Reduce-value-by-segment scan operator
-    typedef ReduceByKeyOp<cub::Sum> ReduceBySegmentOpT;
-
-    // Parameterized BlockLoad type for pairs
-    typedef BlockLoad<
-            KeyValuePairT,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD,
-            AgentSegmentFixupPolicyT::LOAD_ALGORITHM>
-        BlockLoadPairs;
-
-    // Parameterized BlockScan type
-    typedef BlockScan<
-            KeyValuePairT,
-            BLOCK_THREADS,
-            AgentSegmentFixupPolicyT::SCAN_ALGORITHM>
-        BlockScanT;
-
-    // Callback type for obtaining tile prefix during block scan
-    typedef TilePrefixCallbackOp<
-            KeyValuePairT,
-            ReduceBySegmentOpT,
-            ScanTileStateT>
-        TilePrefixCallbackOpT;
-
-    // Shared memory type for this thread block
-    union _TempStorage
-    {
-        struct
-        {
-            typename BlockScanT::TempStorage                scan;           // Smem needed for tile scanning
-            typename TilePrefixCallbackOpT::TempStorage     prefix;         // Smem needed for cooperative prefix callback
-        };
-
-        // Smem needed for loading keys
-        typename BlockLoadPairs::TempStorage load_pairs;
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
-    WrappedPairsInputIteratorT      d_pairs_in;          ///< Input keys
-    AggregatesOutputIteratorT       d_aggregates_out;   ///< Output value aggregates
-    WrappedFixupInputIteratorT      d_fixup_in;         ///< Fixup input values
-    InequalityWrapper<EqualityOpT>  inequality_op;      ///< KeyT inequality operator
-    ReductionOpT                    reduction_op;       ///< Reduction operator
-    ReduceBySegmentOpT              scan_op;            ///< Reduce-by-segment scan operator
-
-
-    //---------------------------------------------------------------------
-    // Constructor
-    //---------------------------------------------------------------------
-
-    // Constructor
-    __device__ __forceinline__
-    AgentSegmentFixup(
-        TempStorage&                temp_storage,       ///< Reference to temp_storage
-        PairsInputIteratorT         d_pairs_in,          ///< Input keys
-        AggregatesOutputIteratorT   d_aggregates_out,   ///< Output value aggregates
-        EqualityOpT                 equality_op,        ///< KeyT equality operator
-        ReductionOpT                reduction_op)       ///< ValueT reduction operator
-    :
-        temp_storage(temp_storage.Alias()),
-        d_pairs_in(d_pairs_in),
-        d_aggregates_out(d_aggregates_out),
-        d_fixup_in(d_aggregates_out),
-        inequality_op(equality_op),
-        reduction_op(reduction_op),
-        scan_op(reduction_op)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Cooperatively scan a device-wide sequence of tiles with other CTAs
-    //---------------------------------------------------------------------
-
-
-    /**
-     * Process input tile.  Specialized for atomic-fixup
-     */
-    template <bool IS_LAST_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
-        int                 tile_idx,           ///< Tile index
-        OffsetT             tile_offset,        ///< Tile offset
-        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
-        Int2Type<true>      use_atomic_fixup)   ///< Marker whether to use atomicAdd (instead of reduce-by-key)
-    {
-        KeyValuePairT   pairs[ITEMS_PER_THREAD];
-
-        // Load pairs
-        KeyValuePairT oob_pair;
-        oob_pair.key = -1;
-
-        if (IS_LAST_TILE)
-            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair);
-        else
-            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs);
-
-        // RLE 
-        #pragma unroll
-        for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            ValueT* d_scatter = d_aggregates_out + pairs[ITEM - 1].key;
-            if (pairs[ITEM].key != pairs[ITEM - 1].key)
-                atomicAdd(d_scatter, pairs[ITEM - 1].value);
-            else
-                pairs[ITEM].value = reduction_op(pairs[ITEM - 1].value, pairs[ITEM].value);
-        }
-
-        // Flush last item if valid
-        ValueT* d_scatter = d_aggregates_out + pairs[ITEMS_PER_THREAD - 1].key;
-        if ((!IS_LAST_TILE) || (pairs[ITEMS_PER_THREAD - 1].key >= 0))
-            atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value);
-    }
-
-
-    /**
-     * Process input tile.  Specialized for reduce-by-key fixup
-     */
-    template <bool IS_LAST_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
-        int                 tile_idx,           ///< Tile index
-        OffsetT             tile_offset,        ///< Tile offset
-        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
-        Int2Type<false>     use_atomic_fixup)   ///< Marker whether to use atomicAdd (instead of reduce-by-key)
-    {
-        KeyValuePairT   pairs[ITEMS_PER_THREAD];
-        KeyValuePairT   scatter_pairs[ITEMS_PER_THREAD];
-
-        // Load pairs
-        KeyValuePairT oob_pair;
-        oob_pair.key = -1;
-
-        if (IS_LAST_TILE)
-            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair);
-        else
-            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs);
-
-        CTA_SYNC();
-
-        KeyValuePairT tile_aggregate;
-        if (tile_idx == 0)
-        {
-            // Exclusive scan of values and segment_flags
-            BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, tile_aggregate);
-
-            // Update tile status if this is not the last tile
-            if (threadIdx.x == 0)
-            {
-                // Set first segment id to not trigger a flush (invalid from exclusive scan)
-                scatter_pairs[0].key = pairs[0].key;
-
-                if (!IS_LAST_TILE)
-                    tile_state.SetInclusive(0, tile_aggregate);
-
-            }
-        }
-        else
-        {
-            // Exclusive scan of values and segment_flags
-            TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
-            BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, prefix_op);
-            tile_aggregate = prefix_op.GetBlockAggregate();
-        }
-
-        // Scatter updated values
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            if (scatter_pairs[ITEM].key != pairs[ITEM].key)
-            {
-                // Update the value at the key location
-                ValueT value    = d_fixup_in[scatter_pairs[ITEM].key];
-                value           = reduction_op(value, scatter_pairs[ITEM].value);
-
-                d_aggregates_out[scatter_pairs[ITEM].key] = value;
-            }
-        }
-
-        // Finalize the last item
-        if (IS_LAST_TILE)
-        {
-            // Last thread will output final count and last item, if necessary
-            if (threadIdx.x == BLOCK_THREADS - 1)
-            {
-                // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment
-                if (num_remaining == TILE_ITEMS)
-                {
-                    // Update the value at the key location
-                    OffsetT last_key = pairs[ITEMS_PER_THREAD - 1].key;
-                    d_aggregates_out[last_key] = reduction_op(tile_aggregate.value, d_fixup_in[last_key]);
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Scan tiles of items as part of a dynamic chained scan
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        int                 num_items,          ///< Total number of input items
-        int                 num_tiles,          ///< Total number of input tiles
-        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
-    {
-        // Blocks are launched in increasing order, so just assign one tile per block
-        int     tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
-        OffsetT tile_offset     = tile_idx * TILE_ITEMS;                    // Global offset for the current tile
-        OffsetT num_remaining   = num_items - tile_offset;                  // Remaining items (including this tile)
-
-        if (num_remaining > TILE_ITEMS)
-        {
-            // Not the last tile (full)
-            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state, Int2Type<USE_ATOMIC_FIXUP>());
-        }
-        else if (num_remaining > 0)
-        {
-            // The last tile (possibly partially-full)
-            ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state, Int2Type<USE_ATOMIC_FIXUP>());
-        }
-    }
-
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh b/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh
deleted file mode 100644
index 327e66530..000000000
--- a/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh
+++ /dev/null
@@ -1,703 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide select.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "single_pass_scan_operators.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_scan.cuh"
-#include "../block/block_exchange.cuh"
-#include "../block/block_discontinuity.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for AgentSelectIf
- */
-template <
-    int                         _BLOCK_THREADS,                 ///< Threads per thread block
-    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
-    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
-    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
-struct AgentSelectIfPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
-    };
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
-    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
-};
-
-
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-
-/**
- * \brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection
- *
- * Performs functor-based selection if SelectOpT functor type != NullType
- * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType
- * Otherwise performs discontinuity selection (keep unique)
- */
-template <
-    typename    AgentSelectIfPolicyT,           ///< Parameterized AgentSelectIfPolicy tuning policy type
-    typename    InputIteratorT,                 ///< Random-access input iterator type for selection items
-    typename    FlagsInputIteratorT,            ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection)
-    typename    SelectedOutputIteratorT,        ///< Random-access input iterator type for selection_flags items
-    typename    SelectOpT,                      ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection)
-    typename    EqualityOpT,                    ///< Equality operator type (NullType if selection functor or selections is to be used for selection)
-    typename    OffsetT,                        ///< Signed integer type for global offsets
-    bool        KEEP_REJECTS>                   ///< Whether or not we push rejected items to the back of the output
-struct AgentSelectIf
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<SelectedOutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                                  // ... then the input iterator's value type,
-        typename std::iterator_traits<SelectedOutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    // The flag value type
-    typedef typename std::iterator_traits<FlagsInputIteratorT>::value_type FlagT;
-
-    // Tile status descriptor interface type
-    typedef ScanTileState<OffsetT> ScanTileStateT;
-
-    // Constants
-    enum
-    {
-        USE_SELECT_OP,
-        USE_SELECT_FLAGS,
-        USE_DISCONTINUITY,
-
-        BLOCK_THREADS           = AgentSelectIfPolicyT::BLOCK_THREADS,
-        ITEMS_PER_THREAD        = AgentSelectIfPolicyT::ITEMS_PER_THREAD,
-        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
-        TWO_PHASE_SCATTER       = (ITEMS_PER_THREAD > 1),
-
-        SELECT_METHOD           = (!Equals<SelectOpT, NullType>::VALUE) ?
-                                    USE_SELECT_OP :
-                                    (!Equals<FlagT, NullType>::VALUE) ?
-                                        USE_SELECT_FLAGS :
-                                        USE_DISCONTINUITY
-    };
-
-    // Cache-modified Input iterator wrapper type (for applying cache modifier) for items
-    typedef typename If<IsPointer<InputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentSelectIfPolicyT::LOAD_MODIFIER, InputT, OffsetT>,        // Wrap the native input pointer with CacheModifiedValuesInputIterator
-            InputIteratorT>::Type                                                               // Directly use the supplied input iterator type
-        WrappedInputIteratorT;
-
-    // Cache-modified Input iterator wrapper type (for applying cache modifier) for values
-    typedef typename If<IsPointer<FlagsInputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentSelectIfPolicyT::LOAD_MODIFIER, FlagT, OffsetT>,    // Wrap the native input pointer with CacheModifiedValuesInputIterator
-            FlagsInputIteratorT>::Type                                                          // Directly use the supplied input iterator type
-        WrappedFlagsInputIteratorT;
-
-    // Parameterized BlockLoad type for input data
-    typedef BlockLoad<
-            OutputT,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD,
-            AgentSelectIfPolicyT::LOAD_ALGORITHM>
-        BlockLoadT;
-
-    // Parameterized BlockLoad type for flags
-    typedef BlockLoad<
-            FlagT,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD,
-            AgentSelectIfPolicyT::LOAD_ALGORITHM>
-        BlockLoadFlags;
-
-    // Parameterized BlockDiscontinuity type for items
-    typedef BlockDiscontinuity<
-            OutputT,
-            BLOCK_THREADS>
-        BlockDiscontinuityT;
-
-    // Parameterized BlockScan type
-    typedef BlockScan<
-            OffsetT,
-            BLOCK_THREADS,
-            AgentSelectIfPolicyT::SCAN_ALGORITHM>
-        BlockScanT;
-
-    // Callback type for obtaining tile prefix during block scan
-    typedef TilePrefixCallbackOp<
-            OffsetT,
-            cub::Sum,
-            ScanTileStateT>
-        TilePrefixCallbackOpT;
-
-    // Item exchange type
-    typedef OutputT ItemExchangeT[TILE_ITEMS];
-
-    // Shared memory type for this thread block
-    union _TempStorage
-    {
-        struct
-        {
-            typename BlockScanT::TempStorage                scan;           // Smem needed for tile scanning
-            typename TilePrefixCallbackOpT::TempStorage     prefix;         // Smem needed for cooperative prefix callback
-            typename BlockDiscontinuityT::TempStorage       discontinuity;  // Smem needed for discontinuity detection
-        };
-
-        // Smem needed for loading items
-        typename BlockLoadT::TempStorage load_items;
-
-        // Smem needed for loading values
-        typename BlockLoadFlags::TempStorage load_flags;
-
-        // Smem needed for compacting items (allows non POD items in this union)
-        Uninitialized<ItemExchangeT> raw_exchange;
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
-    WrappedInputIteratorT           d_in;               ///< Input items
-    SelectedOutputIteratorT         d_selected_out;     ///< Unique output items
-    WrappedFlagsInputIteratorT      d_flags_in;         ///< Input selection flags (if applicable)
-    InequalityWrapper<EqualityOpT>  inequality_op;      ///< T inequality operator
-    SelectOpT                       select_op;          ///< Selection operator
-    OffsetT                         num_items;          ///< Total number of input items
-
-
-    //---------------------------------------------------------------------
-    // Constructor
-    //---------------------------------------------------------------------
-
-    // Constructor
-    __device__ __forceinline__
-    AgentSelectIf(
-        TempStorage                 &temp_storage,      ///< Reference to temp_storage
-        InputIteratorT              d_in,               ///< Input data
-        FlagsInputIteratorT         d_flags_in,         ///< Input selection flags (if applicable)
-        SelectedOutputIteratorT     d_selected_out,     ///< Output data
-        SelectOpT                   select_op,          ///< Selection operator
-        EqualityOpT                 equality_op,        ///< Equality operator
-        OffsetT                     num_items)          ///< Total number of input items
-    :
-        temp_storage(temp_storage.Alias()),
-        d_in(d_in),
-        d_flags_in(d_flags_in),
-        d_selected_out(d_selected_out),
-        select_op(select_op),
-        inequality_op(equality_op),
-        num_items(num_items)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Utility methods for initializing the selections
-    //---------------------------------------------------------------------
-
-    /**
-     * Initialize selections (specialized for selection operator)
-     */
-    template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
-    __device__ __forceinline__ void InitializeSelections(
-        OffsetT                     /*tile_offset*/,
-        OffsetT                     num_tile_items,
-        OutputT                     (&items)[ITEMS_PER_THREAD],
-        OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
-        Int2Type<USE_SELECT_OP>     /*select_method*/)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            // Out-of-bounds items are selection_flags
-            selection_flags[ITEM] = 1;
-
-            if (!IS_LAST_TILE || (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items))
-                selection_flags[ITEM] = select_op(items[ITEM]);
-        }
-    }
-
-
-    /**
-     * Initialize selections (specialized for valid flags)
-     */
-    template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
-    __device__ __forceinline__ void InitializeSelections(
-        OffsetT                     tile_offset,
-        OffsetT                     num_tile_items,
-        OutputT                     (&/*items*/)[ITEMS_PER_THREAD],
-        OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
-        Int2Type<USE_SELECT_FLAGS>  /*select_method*/)
-    {
-        CTA_SYNC();
-
-        FlagT flags[ITEMS_PER_THREAD];
-
-        if (IS_LAST_TILE)
-        {
-            // Out-of-bounds items are selection_flags
-            BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items, 1);
-        }
-        else
-        {
-            BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags);
-        }
-
-        // Convert flag type to selection_flags type
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            selection_flags[ITEM] = flags[ITEM];
-        }
-    }
-
-
-    /**
-     * Initialize selections (specialized for discontinuity detection)
-     */
-    template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
-    __device__ __forceinline__ void InitializeSelections(
-        OffsetT                     tile_offset,
-        OffsetT                     num_tile_items,
-        OutputT                     (&items)[ITEMS_PER_THREAD],
-        OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
-        Int2Type<USE_DISCONTINUITY> /*select_method*/)
-    {
-        if (IS_FIRST_TILE)
-        {
-            CTA_SYNC();
-
-            // Set head selection_flags.  First tile sets the first flag for the first item
-            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op);
-        }
-        else
-        {
-            OutputT tile_predecessor;
-            if (threadIdx.x == 0)
-                tile_predecessor = d_in[tile_offset - 1];
-
-            CTA_SYNC();
-
-            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op, tile_predecessor);
-        }
-
-        // Set selection flags for out-of-bounds items
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            // Set selection_flags for out-of-bounds items
-            if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items))
-                selection_flags[ITEM] = 1;
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Scatter utility methods
-    //---------------------------------------------------------------------
-
-    /**
-     * Scatter flagged items to output offsets (specialized for direct scattering)
-     */
-    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
-    __device__ __forceinline__ void ScatterDirect(
-        OutputT (&items)[ITEMS_PER_THREAD],
-        OffsetT (&selection_flags)[ITEMS_PER_THREAD],
-        OffsetT (&selection_indices)[ITEMS_PER_THREAD],
-        OffsetT num_selections)
-    {
-        // Scatter flagged items
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            if (selection_flags[ITEM])
-            {
-                if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections)
-                {
-                    d_selected_out[selection_indices[ITEM]] = items[ITEM];
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Scatter flagged items to output offsets (specialized for two-phase scattering)
-     */
-    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
-    __device__ __forceinline__ void ScatterTwoPhase(
-        OutputT         (&items)[ITEMS_PER_THREAD],
-        OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
-        OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
-        int             /*num_tile_items*/,                         ///< Number of valid items in this tile
-        int             num_tile_selections,                        ///< Number of selections in this tile
-        OffsetT         num_selections_prefix,                      ///< Total number of selections prior to this tile
-        OffsetT         /*num_rejected_prefix*/,                    ///< Total number of rejections prior to this tile
-        Int2Type<false> /*is_keep_rejects*/)                        ///< Marker type indicating whether to keep rejected items in the second partition
-    {
-        CTA_SYNC();
-
-        // Compact and scatter items
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix;
-            if (selection_flags[ITEM])
-            {
-                temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
-            }
-        }
-
-        CTA_SYNC();
-
-        for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS)
-        {
-            d_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item];
-        }
-    }
-
-
-    /**
-     * Scatter flagged items to output offsets (specialized for two-phase scattering)
-     */
-    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
-    __device__ __forceinline__ void ScatterTwoPhase(
-        OutputT         (&items)[ITEMS_PER_THREAD],
-        OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
-        OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
-        int             num_tile_items,                             ///< Number of valid items in this tile
-        int             num_tile_selections,                        ///< Number of selections in this tile
-        OffsetT         num_selections_prefix,                      ///< Total number of selections prior to this tile
-        OffsetT         num_rejected_prefix,                        ///< Total number of rejections prior to this tile
-        Int2Type<true>  /*is_keep_rejects*/)                        ///< Marker type indicating whether to keep rejected items in the second partition
-    {
-        CTA_SYNC();
-
-        int tile_num_rejections = num_tile_items - num_tile_selections;
-
-        // Scatter items to shared memory (rejections first)
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            int item_idx                = (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
-            int local_selection_idx     = selection_indices[ITEM] - num_selections_prefix;
-            int local_rejection_idx     = item_idx - local_selection_idx;
-            int local_scatter_offset    = (selection_flags[ITEM]) ?
-                                            tile_num_rejections + local_selection_idx :
-                                            local_rejection_idx;
-
-            temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
-        }
-
-        CTA_SYNC();
-
-        // Gather items from shared memory and scatter to global
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            int item_idx            = (ITEM * BLOCK_THREADS) + threadIdx.x;
-            int rejection_idx       = item_idx;
-            int selection_idx       = item_idx - tile_num_rejections;
-            OffsetT scatter_offset  = (item_idx < tile_num_rejections) ?
-                                        num_items - num_rejected_prefix - rejection_idx - 1 :
-                                        num_selections_prefix + selection_idx;
-
-            OutputT item = temp_storage.raw_exchange.Alias()[item_idx];
-
-            if (!IS_LAST_TILE || (item_idx < num_tile_items))
-            {
-                d_selected_out[scatter_offset] = item;
-            }
-        }
-    }
-
-
-    /**
-     * Scatter flagged items
-     */
-    template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
-    __device__ __forceinline__ void Scatter(
-        OutputT         (&items)[ITEMS_PER_THREAD],
-        OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
-        OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
-        int             num_tile_items,                             ///< Number of valid items in this tile
-        int             num_tile_selections,                        ///< Number of selections in this tile
-        OffsetT         num_selections_prefix,                      ///< Total number of selections prior to this tile
-        OffsetT         num_rejected_prefix,                        ///< Total number of rejections prior to this tile
-        OffsetT         num_selections)                             ///< Total number of selections including this tile
-    {
-        // Do a two-phase scatter if (a) keeping both partitions or (b) two-phase is enabled and the average number of selection_flags items per thread is greater than one
-        if (KEEP_REJECTS || (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS)))
-        {
-            ScatterTwoPhase<IS_LAST_TILE, IS_FIRST_TILE>(
-                items,
-                selection_flags,
-                selection_indices,
-                num_tile_items,
-                num_tile_selections,
-                num_selections_prefix,
-                num_rejected_prefix,
-                Int2Type<KEEP_REJECTS>());
-        }
-        else
-        {
-            ScatterDirect<IS_LAST_TILE, IS_FIRST_TILE>(
-                items,
-                selection_flags,
-                selection_indices,
-                num_selections);
-        }
-    }
-
-    //---------------------------------------------------------------------
-    // Cooperatively scan a device-wide sequence of tiles with other CTAs
-    //---------------------------------------------------------------------
-
-
-    /**
-     * Process first tile of input (dynamic chained scan).  Returns the running count of selections (including this tile)
-     */
-    template <bool IS_LAST_TILE>
-    __device__ __forceinline__ OffsetT ConsumeFirstTile(
-        int                 num_tile_items,      ///< Number of input items comprising this tile
-        OffsetT             tile_offset,        ///< Tile offset
-        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
-    {
-        OutputT     items[ITEMS_PER_THREAD];
-        OffsetT     selection_flags[ITEMS_PER_THREAD];
-        OffsetT     selection_indices[ITEMS_PER_THREAD];
-
-        // Load items
-        if (IS_LAST_TILE)
-            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
-        else
-            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);
-
-        // Initialize selection_flags
-        InitializeSelections<true, IS_LAST_TILE>(
-            tile_offset,
-            num_tile_items,
-            items,
-            selection_flags,
-            Int2Type<SELECT_METHOD>());
-
-        CTA_SYNC();
-
-        // Exclusive scan of selection_flags
-        OffsetT num_tile_selections;
-        BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, num_tile_selections);
-
-        if (threadIdx.x == 0)
-        {
-            // Update tile status if this is not the last tile
-            if (!IS_LAST_TILE)
-                tile_state.SetInclusive(0, num_tile_selections);
-        }
-
-        // Discount any out-of-bounds selections
-        if (IS_LAST_TILE)
-            num_tile_selections -= (TILE_ITEMS - num_tile_items);
-
-        // Scatter flagged items
-        Scatter<IS_LAST_TILE, true>(
-            items,
-            selection_flags,
-            selection_indices,
-            num_tile_items,
-            num_tile_selections,
-            0,
-            0,
-            num_tile_selections);
-
-        return num_tile_selections;
-    }
-
-
-    /**
-     * Process subsequent tile of input (dynamic chained scan).  Returns the running count of selections (including this tile)
-     */
-    template <bool IS_LAST_TILE>
-    __device__ __forceinline__ OffsetT ConsumeSubsequentTile(
-        int                 num_tile_items,      ///< Number of input items comprising this tile
-        int                 tile_idx,           ///< Tile index
-        OffsetT             tile_offset,        ///< Tile offset
-        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
-    {
-        OutputT     items[ITEMS_PER_THREAD];
-        OffsetT     selection_flags[ITEMS_PER_THREAD];
-        OffsetT     selection_indices[ITEMS_PER_THREAD];
-
-        // Load items
-        if (IS_LAST_TILE)
-            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
-        else
-            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);
-
-        // Initialize selection_flags
-        InitializeSelections<false, IS_LAST_TILE>(
-            tile_offset,
-            num_tile_items,
-            items,
-            selection_flags,
-            Int2Type<SELECT_METHOD>());
-
-        CTA_SYNC();
-
-        // Exclusive scan of values and selection_flags
-        TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, cub::Sum(), tile_idx);
-        BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, prefix_op);
-
-        OffsetT num_tile_selections     = prefix_op.GetBlockAggregate();
-        OffsetT num_selections          = prefix_op.GetInclusivePrefix();
-        OffsetT num_selections_prefix   = prefix_op.GetExclusivePrefix();
-        OffsetT num_rejected_prefix     = (tile_idx * TILE_ITEMS) - num_selections_prefix;
-
-        // Discount any out-of-bounds selections
-        if (IS_LAST_TILE)
-        {
-            int num_discount    = TILE_ITEMS - num_tile_items;
-            num_selections      -= num_discount;
-            num_tile_selections -= num_discount;
-        }
-
-        // Scatter flagged items
-        Scatter<IS_LAST_TILE, false>(
-            items,
-            selection_flags,
-            selection_indices,
-            num_tile_items,
-            num_tile_selections,
-            num_selections_prefix,
-            num_rejected_prefix,
-            num_selections);
-
-        return num_selections;
-    }
-
-
-    /**
-     * Process a tile of input
-     */
-    template <bool IS_LAST_TILE>
-    __device__ __forceinline__ OffsetT ConsumeTile(
-        int                 num_tile_items,         ///< Number of input items comprising this tile
-        int                 tile_idx,           ///< Tile index
-        OffsetT             tile_offset,        ///< Tile offset
-        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
-    {
-        OffsetT num_selections;
-        if (tile_idx == 0)
-        {
-            num_selections = ConsumeFirstTile<IS_LAST_TILE>(num_tile_items, tile_offset, tile_state);
-        }
-        else
-        {
-            num_selections = ConsumeSubsequentTile<IS_LAST_TILE>(num_tile_items, tile_idx, tile_offset, tile_state);
-        }
-
-        return num_selections;
-    }
-
-
-    /**
-     * Scan tiles of items as part of a dynamic chained scan
-     */
-    template <typename NumSelectedIteratorT>        ///< Output iterator type for recording number of items selection_flags
-    __device__ __forceinline__ void ConsumeRange(
-        int                     num_tiles,          ///< Total number of input tiles
-        ScanTileStateT&         tile_state,         ///< Global tile state descriptor
-        NumSelectedIteratorT    d_num_selected_out) ///< Output total number selection_flags
-    {
-        // Blocks are launched in increasing order, so just assign one tile per block
-        int     tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
-        OffsetT tile_offset     = tile_idx * TILE_ITEMS;                    // Global offset for the current tile
-
-        if (tile_idx < num_tiles - 1)
-        {
-            // Not the last tile (full)
-            ConsumeTile<false>(TILE_ITEMS, tile_idx, tile_offset, tile_state);
-        }
-        else
-        {
-            // The last tile (possibly partially-full)
-            OffsetT num_remaining   = num_items - tile_offset;
-            OffsetT num_selections  = ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state);
-
-            if (threadIdx.x == 0)
-            {
-                // Output the total number of items selection_flags
-                *d_num_selected_out = num_selections;
-            }
-        }
-    }
-
-};
-
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh b/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh
deleted file mode 100644
index 5a6c4c73c..000000000
--- a/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh
+++ /dev/null
@@ -1,670 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "../util_type.cuh"
-#include "../block/block_reduce.cuh"
-#include "../block/block_scan.cuh"
-#include "../block/block_exchange.cuh"
-#include "../thread/thread_search.cuh"
-#include "../thread/thread_operators.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../iterator/counting_input_iterator.cuh"
-#include "../iterator/tex_ref_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for AgentSpmv
- */
-template <
-    int                             _BLOCK_THREADS,                         ///< Threads per thread block
-    int                             _ITEMS_PER_THREAD,                      ///< Items per thread (per tile of input)
-    CacheLoadModifier               _ROW_OFFSETS_SEARCH_LOAD_MODIFIER,      ///< Cache load modifier for reading CSR row-offsets during search
-    CacheLoadModifier               _ROW_OFFSETS_LOAD_MODIFIER,             ///< Cache load modifier for reading CSR row-offsets
-    CacheLoadModifier               _COLUMN_INDICES_LOAD_MODIFIER,          ///< Cache load modifier for reading CSR column-indices
-    CacheLoadModifier               _VALUES_LOAD_MODIFIER,                  ///< Cache load modifier for reading CSR values
-    CacheLoadModifier               _VECTOR_VALUES_LOAD_MODIFIER,           ///< Cache load modifier for reading vector values
-    bool                            _DIRECT_LOAD_NONZEROS,                  ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory)
-    BlockScanAlgorithm              _SCAN_ALGORITHM>                        ///< The BlockScan algorithm to use
-struct AgentSpmvPolicy
-{
-    enum
-    {
-        BLOCK_THREADS                                                   = _BLOCK_THREADS,                       ///< Threads per thread block
-        ITEMS_PER_THREAD                                                = _ITEMS_PER_THREAD,                    ///< Items per thread (per tile of input)
-        DIRECT_LOAD_NONZEROS                                            = _DIRECT_LOAD_NONZEROS,                ///< Whether to load nonzeros directly from global during sequential merging (pre-staged through shared memory)
-    };
-
-    static const CacheLoadModifier  ROW_OFFSETS_SEARCH_LOAD_MODIFIER    = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER;    ///< Cache load modifier for reading CSR row-offsets
-    static const CacheLoadModifier  ROW_OFFSETS_LOAD_MODIFIER           = _ROW_OFFSETS_LOAD_MODIFIER;           ///< Cache load modifier for reading CSR row-offsets
-    static const CacheLoadModifier  COLUMN_INDICES_LOAD_MODIFIER        = _COLUMN_INDICES_LOAD_MODIFIER;        ///< Cache load modifier for reading CSR column-indices
-    static const CacheLoadModifier  VALUES_LOAD_MODIFIER                = _VALUES_LOAD_MODIFIER;                ///< Cache load modifier for reading CSR values
-    static const CacheLoadModifier  VECTOR_VALUES_LOAD_MODIFIER         = _VECTOR_VALUES_LOAD_MODIFIER;         ///< Cache load modifier for reading vector values
-    static const BlockScanAlgorithm SCAN_ALGORITHM                      = _SCAN_ALGORITHM;                      ///< The BlockScan algorithm to use
-
-};
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-template <
-    typename        ValueT,              ///< Matrix and vector value type
-    typename        OffsetT>             ///< Signed integer type for sequence offsets
-struct SpmvParams
-{
-    ValueT*         d_values;            ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
-    OffsetT*        d_row_end_offsets;   ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values
-    OffsetT*        d_column_indices;    ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>.  (Indices are zero-valued.)
-    ValueT*         d_vector_x;          ///< Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
-    ValueT*         d_vector_y;          ///< Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
-    int             num_rows;            ///< Number of rows of matrix <b>A</b>.
-    int             num_cols;            ///< Number of columns of matrix <b>A</b>.
-    int             num_nonzeros;        ///< Number of nonzero elements of matrix <b>A</b>.
-    ValueT          alpha;               ///< Alpha multiplicand
-    ValueT          beta;                ///< Beta addend-multiplicand
-
-    TexRefInputIterator<ValueT, 66778899, OffsetT>  t_vector_x;
-};
-
-
-/**
- * \brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV.
- */
-template <
-    typename    AgentSpmvPolicyT,           ///< Parameterized AgentSpmvPolicy tuning policy type
-    typename    ValueT,                     ///< Matrix and vector value type
-    typename    OffsetT,                    ///< Signed integer type for sequence offsets
-    bool        HAS_ALPHA,                  ///< Whether the input parameter \p alpha is 1
-    bool        HAS_BETA,                   ///< Whether the input parameter \p beta is 0
-    int         PTX_ARCH = CUB_PTX_ARCH>    ///< PTX compute capability
-struct AgentSpmv
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// Constants
-    enum
-    {
-        BLOCK_THREADS           = AgentSpmvPolicyT::BLOCK_THREADS,
-        ITEMS_PER_THREAD        = AgentSpmvPolicyT::ITEMS_PER_THREAD,
-        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
-    };
-
-    /// 2D merge path coordinate type
-    typedef typename CubVector<OffsetT, 2>::Type CoordinateT;
-
-    /// Input iterator wrapper types (for applying cache modifiers)
-
-    typedef CacheModifiedInputIterator<
-            AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
-            OffsetT,
-            OffsetT>
-        RowOffsetsSearchIteratorT;
-
-    typedef CacheModifiedInputIterator<
-            AgentSpmvPolicyT::ROW_OFFSETS_LOAD_MODIFIER,
-            OffsetT,
-            OffsetT>
-        RowOffsetsIteratorT;
-
-    typedef CacheModifiedInputIterator<
-            AgentSpmvPolicyT::COLUMN_INDICES_LOAD_MODIFIER,
-            OffsetT,
-            OffsetT>
-        ColumnIndicesIteratorT;
-
-    typedef CacheModifiedInputIterator<
-            AgentSpmvPolicyT::VALUES_LOAD_MODIFIER,
-            ValueT,
-            OffsetT>
-        ValueIteratorT;
-
-    typedef CacheModifiedInputIterator<
-            AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER,
-            ValueT,
-            OffsetT>
-        VectorValueIteratorT;
-
-    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
-    typedef KeyValuePair<OffsetT, ValueT> KeyValuePairT;
-
-    // Reduce-value-by-segment scan operator
-    typedef ReduceByKeyOp<cub::Sum> ReduceBySegmentOpT;
-
-    // BlockReduce specialization
-    typedef BlockReduce<
-            ValueT,
-            BLOCK_THREADS,
-            BLOCK_REDUCE_WARP_REDUCTIONS>
-        BlockReduceT;
-
-    // BlockScan specialization
-    typedef BlockScan<
-            KeyValuePairT,
-            BLOCK_THREADS,
-            AgentSpmvPolicyT::SCAN_ALGORITHM>
-        BlockScanT;
-
-    // BlockScan specialization
-    typedef BlockScan<
-            ValueT,
-            BLOCK_THREADS,
-            AgentSpmvPolicyT::SCAN_ALGORITHM>
-        BlockPrefixSumT;
-
-    // BlockExchange specialization
-    typedef BlockExchange<
-            ValueT,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD>
-        BlockExchangeT;
-
-    /// Merge item type (either a non-zero value or a row-end offset)
-    union MergeItem
-    {
-        // Value type to pair with index type OffsetT (NullType if loading values directly during merge)
-        typedef typename If<AgentSpmvPolicyT::DIRECT_LOAD_NONZEROS, NullType, ValueT>::Type MergeValueT;
-
-        OffsetT     row_end_offset;
-        MergeValueT nonzero;
-    };
-
-    /// Shared memory type required by this thread block
-    struct _TempStorage
-    {
-        CoordinateT tile_coords[2];
-
-        union Aliasable
-        {
-            // Smem needed for tile of merge items
-            MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1];
-
-            // Smem needed for block exchange
-            typename BlockExchangeT::TempStorage exchange;
-
-            // Smem needed for block-wide reduction
-            typename BlockReduceT::TempStorage reduce;
-
-            // Smem needed for tile scanning
-            typename BlockScanT::TempStorage scan;
-
-            // Smem needed for tile prefix sum
-            typename BlockPrefixSumT::TempStorage prefix_sum;
-
-        } aliasable;
-    };
-
-    /// Temporary storage type (unionable)
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-
-    _TempStorage&                   temp_storage;         /// Reference to temp_storage
-
-    SpmvParams<ValueT, OffsetT>&    spmv_params;
-
-    ValueIteratorT                  wd_values;            ///< Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
-    RowOffsetsIteratorT             wd_row_end_offsets;   ///< Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values
-    ColumnIndicesIteratorT          wd_column_indices;    ///< Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>.  (Indices are zero-valued.)
-    VectorValueIteratorT            wd_vector_x;          ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
-    VectorValueIteratorT            wd_vector_y;          ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ AgentSpmv(
-        TempStorage&                    temp_storage,           ///< Reference to temp_storage
-        SpmvParams<ValueT, OffsetT>&    spmv_params)            ///< SpMV input parameter bundle
-    :
-        temp_storage(temp_storage.Alias()),
-        spmv_params(spmv_params),
-        wd_values(spmv_params.d_values),
-        wd_row_end_offsets(spmv_params.d_row_end_offsets),
-        wd_column_indices(spmv_params.d_column_indices),
-        wd_vector_x(spmv_params.d_vector_x),
-        wd_vector_y(spmv_params.d_vector_y)
-    {}
-
-
-
-
-    /**
-     * Consume a merge tile, specialized for direct-load of nonzeros
-     */
-    __device__ __forceinline__ KeyValuePairT ConsumeTile(
-        int             tile_idx,
-        CoordinateT     tile_start_coord,
-        CoordinateT     tile_end_coord,
-        Int2Type<true>  is_direct_load)     ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
-    {
-        int         tile_num_rows           = tile_end_coord.x - tile_start_coord.x;
-        int         tile_num_nonzeros       = tile_end_coord.y - tile_start_coord.y;
-        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;
-
-        // Gather the row end-offsets for the merge tile into shared memory
-        for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
-        {
-            s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item];
-        }
-
-        CTA_SYNC();
-
-        // Search for the thread's starting coordinate within the merge tile
-        CountingInputIterator<OffsetT>  tile_nonzero_indices(tile_start_coord.y);
-        CoordinateT                     thread_start_coord;
-
-        MergePathSearch(
-            OffsetT(threadIdx.x * ITEMS_PER_THREAD),    // Diagonal
-            s_tile_row_end_offsets,                     // List A
-            tile_nonzero_indices,                       // List B
-            tile_num_rows,
-            tile_num_nonzeros,
-            thread_start_coord);
-
-        CTA_SYNC();            // Perf-sync
-
-        // Compute the thread's merge path segment
-        CoordinateT     thread_current_coord = thread_start_coord;
-        KeyValuePairT   scan_segment[ITEMS_PER_THREAD];
-
-        ValueT          running_total = 0.0;
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            OffsetT nonzero_idx         = CUB_MIN(tile_nonzero_indices[thread_current_coord.y], spmv_params.num_nonzeros - 1);
-            OffsetT column_idx          = wd_column_indices[nonzero_idx];
-            ValueT  value               = wd_values[nonzero_idx];
-
-            ValueT  vector_value        = spmv_params.t_vector_x[column_idx];
-#if (CUB_PTX_ARCH >= 350)
-            vector_value                = wd_vector_x[column_idx];
-#endif
-            ValueT  nonzero             = value * vector_value;
-
-            OffsetT row_end_offset      = s_tile_row_end_offsets[thread_current_coord.x];
-
-            if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
-            {
-                // Move down (accumulate)
-                running_total += nonzero;
-                scan_segment[ITEM].value    = running_total;
-                scan_segment[ITEM].key      = tile_num_rows;
-                ++thread_current_coord.y;
-            }
-            else
-            {
-                // Move right (reset)
-                scan_segment[ITEM].value    = running_total;
-                scan_segment[ITEM].key      = thread_current_coord.x;
-                running_total               = 0.0;
-                ++thread_current_coord.x;
-            }
-        }
-
-        CTA_SYNC();
-
-        // Block-wide reduce-value-by-segment
-        KeyValuePairT       tile_carry;
-        ReduceBySegmentOpT  scan_op;
-        KeyValuePairT       scan_item;
-
-        scan_item.value = running_total;
-        scan_item.key   = thread_current_coord.x;
-
-        BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry);
-
-        if (tile_num_rows > 0)
-        {
-            if (threadIdx.x == 0)
-                scan_item.key = -1;
-
-            // Direct scatter
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-            {
-                if (scan_segment[ITEM].key < tile_num_rows)
-                {
-                    if (scan_item.key == scan_segment[ITEM].key)
-                        scan_segment[ITEM].value = scan_item.value + scan_segment[ITEM].value;
-
-                    if (HAS_ALPHA)
-                    {
-                        scan_segment[ITEM].value *= spmv_params.alpha;
-                    }
-
-                    if (HAS_BETA)
-                    {
-                        // Update the output vector element
-                        ValueT addend = spmv_params.beta * wd_vector_y[tile_start_coord.x + scan_segment[ITEM].key];
-                        scan_segment[ITEM].value += addend;
-                    }
-
-                    // Set the output vector element
-                    spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].key] = scan_segment[ITEM].value;
-                }
-            }
-        }
-
-        // Return the tile's running carry-out
-        return tile_carry;
-    }
-
-
-
-    /**
-     * Consume a merge tile, specialized for indirect load of nonzeros
-     */
-    __device__ __forceinline__ KeyValuePairT ConsumeTile(
-        int             tile_idx,
-        CoordinateT     tile_start_coord,
-        CoordinateT     tile_end_coord,
-        Int2Type<false> is_direct_load)     ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
-    {
-        int         tile_num_rows           = tile_end_coord.x - tile_start_coord.x;
-        int         tile_num_nonzeros       = tile_end_coord.y - tile_start_coord.y;
-
-#if (CUB_PTX_ARCH >= 520)
-
-        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;
-        ValueT*     s_tile_nonzeros         = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;
-
-        // Gather the nonzeros for the merge tile into shared memory
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS);
-
-            ValueIteratorT a                = wd_values + tile_start_coord.y + nonzero_idx;
-            ColumnIndicesIteratorT ci       = wd_column_indices + tile_start_coord.y + nonzero_idx;
-            ValueT* s                       = s_tile_nonzeros + nonzero_idx;
-
-            if (nonzero_idx < tile_num_nonzeros)
-            {
-
-                OffsetT column_idx              = *ci;
-                ValueT  value                   = *a;
-
-                ValueT  vector_value            = spmv_params.t_vector_x[column_idx];
-                vector_value                    = wd_vector_x[column_idx];
-
-                ValueT  nonzero                 = value * vector_value;
-
-                *s    = nonzero;
-            }
-        }
-
-
-#else
-
-        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;
-        ValueT*     s_tile_nonzeros         = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;
-
-        // Gather the nonzeros for the merge tile into shared memory
-        if (tile_num_nonzeros > 0)
-        {
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-            {
-                int     nonzero_idx             = threadIdx.x + (ITEM * BLOCK_THREADS);
-                nonzero_idx                     = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1);
-
-                OffsetT column_idx              = wd_column_indices[tile_start_coord.y + nonzero_idx];
-                ValueT  value                   = wd_values[tile_start_coord.y + nonzero_idx];
-
-                ValueT  vector_value            = spmv_params.t_vector_x[column_idx];
-#if (CUB_PTX_ARCH >= 350)
-                vector_value                    = wd_vector_x[column_idx];
-#endif
-                ValueT  nonzero                 = value * vector_value;
-
-                s_tile_nonzeros[nonzero_idx]    = nonzero;
-            }
-        }
-
-#endif
-
-        // Gather the row end-offsets for the merge tile into shared memory
-        #pragma unroll 1
-        for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
-        {
-            s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item];
-        }
-
-        CTA_SYNC();
-
-        // Search for the thread's starting coordinate within the merge tile
-        CountingInputIterator<OffsetT>  tile_nonzero_indices(tile_start_coord.y);
-        CoordinateT                     thread_start_coord;
-
-        MergePathSearch(
-            OffsetT(threadIdx.x * ITEMS_PER_THREAD),    // Diagonal
-            s_tile_row_end_offsets,                     // List A
-            tile_nonzero_indices,                       // List B
-            tile_num_rows,
-            tile_num_nonzeros,
-            thread_start_coord);
-
-        CTA_SYNC();            // Perf-sync
-
-        // Compute the thread's merge path segment
-        CoordinateT     thread_current_coord = thread_start_coord;
-        KeyValuePairT   scan_segment[ITEMS_PER_THREAD];
-        ValueT          running_total = 0.0;
-
-        OffsetT row_end_offset  = s_tile_row_end_offsets[thread_current_coord.x];
-        ValueT  nonzero         = s_tile_nonzeros[thread_current_coord.y];
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
-            {
-                // Move down (accumulate)
-                scan_segment[ITEM].value    = nonzero;
-                running_total               += nonzero;
-                ++thread_current_coord.y;
-                nonzero                     = s_tile_nonzeros[thread_current_coord.y];
-            }
-            else
-            {
-                // Move right (reset)
-                scan_segment[ITEM].value    = 0.0;
-                running_total               = 0.0;
-                ++thread_current_coord.x;
-                row_end_offset              = s_tile_row_end_offsets[thread_current_coord.x];
-            }
-
-            scan_segment[ITEM].key = thread_current_coord.x;
-        }
-
-        CTA_SYNC();
-
-        // Block-wide reduce-value-by-segment
-        KeyValuePairT       tile_carry;
-        ReduceBySegmentOpT  scan_op;
-        KeyValuePairT       scan_item;
-
-        scan_item.value = running_total;
-        scan_item.key = thread_current_coord.x;
-
-        BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry);
-
-        if (threadIdx.x == 0)
-        {
-            scan_item.key = thread_start_coord.x;
-            scan_item.value = 0.0;
-        }
-
-        if (tile_num_rows > 0)
-        {
-
-            CTA_SYNC();
-
-            // Scan downsweep and scatter
-            ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero;
-
-            if (scan_item.key != scan_segment[0].key)
-            {
-                s_partials[scan_item.key] = scan_item.value;
-            }
-            else
-            {
-                scan_segment[0].value += scan_item.value;
-            }
-
-            #pragma unroll
-            for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
-            {
-                if (scan_segment[ITEM - 1].key != scan_segment[ITEM].key)
-                {
-                    s_partials[scan_segment[ITEM - 1].key] = scan_segment[ITEM - 1].value;
-                }
-                else
-                {
-                    scan_segment[ITEM].value += scan_segment[ITEM - 1].value;
-                }
-            }
-
-            CTA_SYNC();
-
-            #pragma unroll 1
-            for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS)
-            {
-                spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item];
-            }
-        }
-
-        // Return the tile's running carry-out
-        return tile_carry;
-    }
-
-
-    /**
-     * Consume input tile
-     */
-    __device__ __forceinline__ void ConsumeTile(
-        CoordinateT*    d_tile_coordinates,     ///< [in] Pointer to the temporary array of tile starting coordinates
-        KeyValuePairT*  d_tile_carry_pairs,     ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block
-        int             num_merge_tiles)        ///< [in] Number of merge tiles
-    {
-        int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
-
-        if (tile_idx >= num_merge_tiles)
-            return;
-
-        // Read our starting coordinates
-        if (threadIdx.x < 2)
-        {
-            if (d_tile_coordinates == NULL)
-            {
-                // Search our starting coordinates
-                OffsetT                         diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS;
-                CoordinateT                     tile_coord;
-                CountingInputIterator<OffsetT>  nonzero_indices(0);
-
-                // Search the merge path
-                MergePathSearch(
-                    diagonal,
-                    RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets),
-                    nonzero_indices,
-                    spmv_params.num_rows,
-                    spmv_params.num_nonzeros,
-                    tile_coord);
-
-                temp_storage.tile_coords[threadIdx.x] = tile_coord;
-            }
-            else
-            {
-                temp_storage.tile_coords[threadIdx.x] = d_tile_coordinates[tile_idx + threadIdx.x];
-            }
-        }
-
-        CTA_SYNC();
-
-        CoordinateT tile_start_coord     = temp_storage.tile_coords[0];
-        CoordinateT tile_end_coord       = temp_storage.tile_coords[1];
-
-        // Consume multi-segment tile
-        KeyValuePairT tile_carry = ConsumeTile(
-            tile_idx,
-            tile_start_coord,
-            tile_end_coord,
-            Int2Type<AgentSpmvPolicyT::DIRECT_LOAD_NONZEROS>());
-
-        // Output the tile's carry-out
-        if (threadIdx.x == 0)
-        {
-            if (HAS_ALPHA)
-                tile_carry.value *= spmv_params.alpha;
-
-            tile_carry.key += tile_start_coord.x;
-            d_tile_carry_pairs[tile_idx]    = tile_carry;
-        }
-    }
-
-
-};
-
-
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh b/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh
deleted file mode 100644
index fd76add77..000000000
--- a/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh
+++ /dev/null
@@ -1,815 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Callback operator types for supplying BlockScan prefixes
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../warp/warp_reduce.cuh"
-#include "../util_arch.cuh"
-#include "../util_device.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Prefix functor type for maintaining a running prefix while scanning a
- * region independent of other thread blocks
- ******************************************************************************/
-
-/**
- * Stateful callback operator type for supplying BlockScan prefixes.
- * Maintains a running prefix that can be applied to consecutive
- * BlockScan operations.
- */
-template <
-    typename T,                 ///< BlockScan value type
-    typename ScanOpT>            ///< Wrapped scan operator type
-struct BlockScanRunningPrefixOp
-{
-    ScanOpT     op;                 ///< Wrapped scan operator
-    T           running_total;      ///< Running block-wide prefix
-
-    /// Constructor
-    __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOpT op)
-    :
-        op(op)
-    {}
-
-    /// Constructor
-    __device__ __forceinline__ BlockScanRunningPrefixOp(
-        T starting_prefix,
-        ScanOpT op)
-    :
-        op(op),
-        running_total(starting_prefix)
-    {}
-
-    /**
-     * Prefix callback operator.  Returns the block-wide running_total in thread-0.
-     */
-    __device__ __forceinline__ T operator()(
-        const T &block_aggregate)              ///< The aggregate sum of the BlockScan inputs
-    {
-        T retval = running_total;
-        running_total = op(running_total, block_aggregate);
-        return retval;
-    }
-};
-
-
-/******************************************************************************
- * Generic tile status interface types for block-cooperative scans
- ******************************************************************************/
-
-/**
- * Enumerations of tile status
- */
-enum ScanTileStatus
-{
-    SCAN_TILE_OOB,          // Out-of-bounds (e.g., padding)
-    SCAN_TILE_INVALID = 99, // Not yet processed
-    SCAN_TILE_PARTIAL,      // Tile aggregate is available
-    SCAN_TILE_INCLUSIVE,    // Inclusive tile prefix is available
-};
-
-
-/**
- * Tile status interface.
- */
-template <
-    typename    T,
-    bool        SINGLE_WORD = Traits<T>::PRIMITIVE>
-struct ScanTileState;
-
-
-/**
- * Tile status interface specialized for scan status and value types
- * that can be combined into one machine word that can be
- * read/written coherently in a single access.
- */
-template <typename T>
-struct ScanTileState<T, true>
-{
-    // Status word type
-    typedef typename If<(sizeof(T) == 8),
-        long long,
-        typename If<(sizeof(T) == 4),
-            int,
-            typename If<(sizeof(T) == 2),
-                short,
-                char>::Type>::Type>::Type StatusWord;
-
-
-    // Unit word type
-    typedef typename If<(sizeof(T) == 8),
-        longlong2,
-        typename If<(sizeof(T) == 4),
-            int2,
-            typename If<(sizeof(T) == 2),
-                int,
-                uchar2>::Type>::Type>::Type TxnWord;
-
-
-    // Device word type
-    struct TileDescriptor
-    {
-        StatusWord  status;
-        T           value;
-    };
-
-
-    // Constants
-    enum
-    {
-        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
-    };
-
-
-    // Device storage
-    TxnWord *d_tile_descriptors;
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    ScanTileState()
-    :
-        d_tile_descriptors(NULL)
-    {}
-
-
-    /// Initializer
-    __host__ __device__ __forceinline__
-    cudaError_t Init(
-        int     /*num_tiles*/,                      ///< [in] Number of tiles
-        void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t  /*temp_storage_bytes*/)             ///< [in] Size in bytes of \t d_temp_storage allocation
-    {
-        d_tile_descriptors = reinterpret_cast<TxnWord*>(d_temp_storage);
-        return cudaSuccess;
-    }
-
-
-    /**
-     * Compute device memory needed for tile status
-     */
-    __host__ __device__ __forceinline__
-    static cudaError_t AllocationSize(
-        int     num_tiles,                          ///< [in] Number of tiles
-        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \t d_temp_storage allocation
-    {
-        temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor);       // bytes needed for tile status descriptors
-        return cudaSuccess;
-    }
-
-
-    /**
-     * Initialize (from device)
-     */
-    __device__ __forceinline__ void InitializeStatus(int num_tiles)
-    {
-        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-
-        TxnWord val = TxnWord();
-        TileDescriptor *descriptor = reinterpret_cast<TileDescriptor*>(&val);
-
-        if (tile_idx < num_tiles)
-        {
-            // Not-yet-set
-            descriptor->status = StatusWord(SCAN_TILE_INVALID);
-            d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val;
-        }
-
-        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
-        {
-            // Padding
-            descriptor->status = StatusWord(SCAN_TILE_OOB);
-            d_tile_descriptors[threadIdx.x] = val;
-        }
-    }
-
-
-    /**
-     * Update the specified tile's inclusive value and corresponding status
-     */
-    __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
-    {
-        TileDescriptor tile_descriptor;
-        tile_descriptor.status = SCAN_TILE_INCLUSIVE;
-        tile_descriptor.value = tile_inclusive;
-
-        TxnWord alias;
-        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
-        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
-    }
-
-
-    /**
-     * Update the specified tile's partial value and corresponding status
-     */
-    __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
-    {
-        TileDescriptor tile_descriptor;
-        tile_descriptor.status = SCAN_TILE_PARTIAL;
-        tile_descriptor.value = tile_partial;
-
-        TxnWord alias;
-        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
-        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
-    }
-
-    /**
-     * Wait for the corresponding tile to become non-invalid
-     */
-    __device__ __forceinline__ void WaitForValid(
-        int             tile_idx,
-        StatusWord      &status,
-        T               &value)
-    {
-        TileDescriptor tile_descriptor;
-        do
-        {
-            __threadfence_block(); // prevent hoisting loads from loop
-            TxnWord alias = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
-            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
-
-        } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));
-
-        status = tile_descriptor.status;
-        value = tile_descriptor.value;
-    }
-
-};
-
-
-
-/**
- * Tile status interface specialized for scan status and value types that
- * cannot be combined into one machine word.
- */
-template <typename T>
-struct ScanTileState<T, false>
-{
-    // Status word type
-    typedef char StatusWord;
-
-    // Constants
-    enum
-    {
-        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
-    };
-
-    // Device storage
-    StatusWord  *d_tile_status;
-    T           *d_tile_partial;
-    T           *d_tile_inclusive;
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    ScanTileState()
-    :
-        d_tile_status(NULL),
-        d_tile_partial(NULL),
-        d_tile_inclusive(NULL)
-    {}
-
-
-    /// Initializer
-    __host__ __device__ __forceinline__
-    cudaError_t Init(
-        int     num_tiles,                          ///< [in] Number of tiles
-        void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t  temp_storage_bytes)                 ///< [in] Size in bytes of \t d_temp_storage allocation
-    {
-        cudaError_t error = cudaSuccess;
-        do
-        {
-            void*   allocations[3] = { NULL, NULL, NULL };
-            size_t  allocation_sizes[3];
-
-            allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord);           // bytes needed for tile status descriptors
-            allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);     // bytes needed for partials
-            allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);     // bytes needed for inclusives
-
-            // Compute allocation pointers into the single storage blob
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-
-            // Alias the offsets
-            d_tile_status       = reinterpret_cast<StatusWord*>(allocations[0]);
-            d_tile_partial      = reinterpret_cast<T*>(allocations[1]);
-            d_tile_inclusive    = reinterpret_cast<T*>(allocations[2]);
-        }
-        while (0);
-
-        return error;
-    }
-
-
-    /**
-     * Compute device memory needed for tile status
-     */
-    __host__ __device__ __forceinline__
-    static cudaError_t AllocationSize(
-        int     num_tiles,                          ///< [in] Number of tiles
-        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \t d_temp_storage allocation
-    {
-        // Specify storage allocation requirements
-        size_t  allocation_sizes[3];
-        allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord);         // bytes needed for tile status descriptors
-        allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);   // bytes needed for partials
-        allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);   // bytes needed for inclusives
-
-        // Set the necessary size of the blob
-        void* allocations[3];
-        return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes));
-    }
-
-
-    /**
-     * Initialize (from device)
-     */
-    __device__ __forceinline__ void InitializeStatus(int num_tiles)
-    {
-        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-        if (tile_idx < num_tiles)
-        {
-            // Not-yet-set
-            d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID);
-        }
-
-        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
-        {
-            // Padding
-            d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB);
-        }
-    }
-
-
-    /**
-     * Update the specified tile's inclusive value and corresponding status
-     */
-    __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
-    {
-        // Update tile inclusive value
-        ThreadStore<STORE_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive);
-
-        // Fence
-        __threadfence();
-
-        // Update tile status
-        ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE));
-    }
-
-
-    /**
-     * Update the specified tile's partial value and corresponding status
-     */
-    __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
-    {
-        // Update tile partial value
-        ThreadStore<STORE_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial);
-
-        // Fence
-        __threadfence();
-
-        // Update tile status
-        ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL));
-    }
-
-    /**
-     * Wait for the corresponding tile to become non-invalid
-     */
-    __device__ __forceinline__ void WaitForValid(
-        int             tile_idx,
-        StatusWord      &status,
-        T               &value)
-    {
-        do {
-            status = ThreadLoad<LOAD_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx);
-
-            __threadfence();    // prevent hoisting loads from loop or loads below above this one
-
-        } while (status == SCAN_TILE_INVALID);
-
-        if (status == StatusWord(SCAN_TILE_PARTIAL)) 
-            value = ThreadLoad<LOAD_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx);
-        else
-            value = ThreadLoad<LOAD_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
-    }
-};
-
-
-/******************************************************************************
- * ReduceByKey tile status interface types for block-cooperative scans
- ******************************************************************************/
-
-/**
- * Tile status interface for reduction by key.
- *
- */
-template <
-    typename    ValueT,
-    typename    KeyT,
-    bool        SINGLE_WORD = (Traits<ValueT>::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16)>
-struct ReduceByKeyScanTileState;
-
-
-/**
- * Tile status interface for reduction by key, specialized for scan status and value types that
- * cannot be combined into one machine word.
- */
-template <
-    typename    ValueT,
-    typename    KeyT>
-struct ReduceByKeyScanTileState<ValueT, KeyT, false> :
-    ScanTileState<KeyValuePair<KeyT, ValueT> >
-{
-    typedef ScanTileState<KeyValuePair<KeyT, ValueT> > SuperClass;
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    ReduceByKeyScanTileState() : SuperClass() {}
-};
-
-
-/**
- * Tile status interface for reduction by key, specialized for scan status and value types that
- * can be combined into one machine word that can be read/written coherently in a single access.
- */
-template <
-    typename ValueT,
-    typename KeyT>
-struct ReduceByKeyScanTileState<ValueT, KeyT, true>
-{
-    typedef KeyValuePair<KeyT, ValueT>KeyValuePairT;
-
-    // Constants
-    enum
-    {
-        PAIR_SIZE           = sizeof(ValueT) + sizeof(KeyT),
-        TXN_WORD_SIZE       = 1 << Log2<PAIR_SIZE + 1>::VALUE,
-        STATUS_WORD_SIZE    = TXN_WORD_SIZE - PAIR_SIZE,
-
-        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
-    };
-
-    // Status word type
-    typedef typename If<(STATUS_WORD_SIZE == 8),
-        long long,
-        typename If<(STATUS_WORD_SIZE == 4),
-            int,
-            typename If<(STATUS_WORD_SIZE == 2),
-                short,
-                char>::Type>::Type>::Type StatusWord;
-
-    // Status word type
-    typedef typename If<(TXN_WORD_SIZE == 16),
-        longlong2,
-        typename If<(TXN_WORD_SIZE == 8),
-            long long,
-            int>::Type>::Type TxnWord;
-
-    // Device word type (for when sizeof(ValueT) == sizeof(KeyT))
-    struct TileDescriptorBigStatus
-    {
-        KeyT        key;
-        ValueT      value;
-        StatusWord  status;
-    };
-
-    // Device word type (for when sizeof(ValueT) != sizeof(KeyT))
-    struct TileDescriptorLittleStatus
-    {
-        ValueT      value;
-        StatusWord  status;
-        KeyT        key;
-    };
-
-    // Device word type
-    typedef typename If<
-            (sizeof(ValueT) == sizeof(KeyT)),
-            TileDescriptorBigStatus,
-            TileDescriptorLittleStatus>::Type
-        TileDescriptor;
-
-
-    // Device storage
-    TxnWord *d_tile_descriptors;
-
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    ReduceByKeyScanTileState()
-    :
-        d_tile_descriptors(NULL)
-    {}
-
-
-    /// Initializer
-    __host__ __device__ __forceinline__
-    cudaError_t Init(
-        int     /*num_tiles*/,                      ///< [in] Number of tiles
-        void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t  /*temp_storage_bytes*/)             ///< [in] Size in bytes of \t d_temp_storage allocation
-    {
-        d_tile_descriptors = reinterpret_cast<TxnWord*>(d_temp_storage);
-        return cudaSuccess;
-    }
-
-
-    /**
-     * Compute device memory needed for tile status
-     */
-    __host__ __device__ __forceinline__
-    static cudaError_t AllocationSize(
-        int     num_tiles,                          ///< [in] Number of tiles
-        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \t d_temp_storage allocation
-    {
-        temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor);       // bytes needed for tile status descriptors
-        return cudaSuccess;
-    }
-
-
-    /**
-     * Initialize (from device)
-     */
-    __device__ __forceinline__ void InitializeStatus(int num_tiles)
-    {
-        int             tile_idx    = (blockIdx.x * blockDim.x) + threadIdx.x;
-        TxnWord         val         = TxnWord();
-        TileDescriptor  *descriptor = reinterpret_cast<TileDescriptor*>(&val);
-
-        if (tile_idx < num_tiles)
-        {
-            // Not-yet-set
-            descriptor->status = StatusWord(SCAN_TILE_INVALID);
-            d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val;
-        }
-
-        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
-        {
-            // Padding
-            descriptor->status = StatusWord(SCAN_TILE_OOB);
-            d_tile_descriptors[threadIdx.x] = val;
-        }
-    }
-
-
-    /**
-     * Update the specified tile's inclusive value and corresponding status
-     */
-    __device__ __forceinline__ void SetInclusive(int tile_idx, KeyValuePairT tile_inclusive)
-    {
-        TileDescriptor tile_descriptor;
-        tile_descriptor.status  = SCAN_TILE_INCLUSIVE;
-        tile_descriptor.value   = tile_inclusive.value;
-        tile_descriptor.key     = tile_inclusive.key;
-
-        TxnWord alias;
-        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
-        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
-    }
-
-
-    /**
-     * Update the specified tile's partial value and corresponding status
-     */
-    __device__ __forceinline__ void SetPartial(int tile_idx, KeyValuePairT tile_partial)
-    {
-        TileDescriptor tile_descriptor;
-        tile_descriptor.status  = SCAN_TILE_PARTIAL;
-        tile_descriptor.value   = tile_partial.value;
-        tile_descriptor.key     = tile_partial.key;
-
-        TxnWord alias;
-        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
-        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
-    }
-
-    /**
-     * Wait for the corresponding tile to become non-invalid
-     */
-    __device__ __forceinline__ void WaitForValid(
-        int                     tile_idx,
-        StatusWord              &status,
-        KeyValuePairT           &value)
-    {
-//        TxnWord         alias           = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
-//        TileDescriptor  tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
-//
-//        while (tile_descriptor.status == SCAN_TILE_INVALID)
-//        {
-//            __threadfence_block(); // prevent hoisting loads from loop
-//
-//            alias           = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
-//            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
-//        }
-//
-//        status      = tile_descriptor.status;
-//        value.value = tile_descriptor.value;
-//        value.key   = tile_descriptor.key;
-
-        TileDescriptor tile_descriptor;
-        do
-        {
-            __threadfence_block(); // prevent hoisting loads from loop
-            TxnWord alias = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
-            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
-
-        } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));
-
-        status      = tile_descriptor.status;
-        value.value = tile_descriptor.value;
-        value.key   = tile_descriptor.key;
-    }
-
-};
-
-
-/******************************************************************************
- * Prefix call-back operator for coupling local block scan within a
- * block-cooperative scan
- ******************************************************************************/
-
-/**
- * Stateful block-scan prefix functor.  Provides the the running prefix for
- * the current tile by using the call-back warp to wait on on
- * aggregates/prefixes from predecessor tiles to become available.
- */
-template <
-    typename    T,
-    typename    ScanOpT,
-    typename    ScanTileStateT,
-    int         PTX_ARCH = CUB_PTX_ARCH>
-struct TilePrefixCallbackOp
-{
-    // Parameterized warp reduce
-    typedef WarpReduce<T, CUB_PTX_WARP_THREADS, PTX_ARCH> WarpReduceT;
-
-    // Temporary storage type
-    struct _TempStorage
-    {
-        typename WarpReduceT::TempStorage   warp_reduce;
-        T                                   exclusive_prefix;
-        T                                   inclusive_prefix;
-        T                                   block_aggregate;
-    };
-
-    // Alias wrapper allowing temporary storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-    // Type of status word
-    typedef typename ScanTileStateT::StatusWord StatusWord;
-
-    // Fields
-    _TempStorage&               temp_storage;       ///< Reference to a warp-reduction instance
-    ScanTileStateT&             tile_status;        ///< Interface to tile status
-    ScanOpT                     scan_op;            ///< Binary scan operator
-    int                         tile_idx;           ///< The current tile index
-    T                           exclusive_prefix;   ///< Exclusive prefix for the tile
-    T                           inclusive_prefix;   ///< Inclusive prefix for the tile
-
-    // Constructor
-    __device__ __forceinline__
-    TilePrefixCallbackOp(
-        ScanTileStateT       &tile_status,
-        TempStorage         &temp_storage,
-        ScanOpT              scan_op,
-        int                 tile_idx)
-    :
-        temp_storage(temp_storage.Alias()),
-        tile_status(tile_status),
-        scan_op(scan_op),
-        tile_idx(tile_idx) {}
-
-
-    // Block until all predecessors within the warp-wide window have non-invalid status
-    __device__ __forceinline__
-    void ProcessWindow(
-        int         predecessor_idx,        ///< Preceding tile index to inspect
-        StatusWord  &predecessor_status,    ///< [out] Preceding tile status
-        T           &window_aggregate)      ///< [out] Relevant partial reduction from this window of preceding tiles
-    {
-        T value;
-        tile_status.WaitForValid(predecessor_idx, predecessor_status, value);
-
-        // Perform a segmented reduction to get the prefix for the current window.
-        // Use the swizzled scan operator because we are now scanning *down* towards thread0.
-
-        int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE));
-        window_aggregate = WarpReduceT(temp_storage.warp_reduce).TailSegmentedReduce(
-            value,
-            tail_flag,
-            SwizzleScanOp<ScanOpT>(scan_op));
-    }
-
-
-    // BlockScan prefix callback functor (called by the first warp)
-    __device__ __forceinline__
-    T operator()(T block_aggregate)
-    {
-
-        // Update our status with our tile-aggregate
-        if (threadIdx.x == 0)
-        {
-            temp_storage.block_aggregate = block_aggregate;
-            tile_status.SetPartial(tile_idx, block_aggregate);
-        }
-
-        int         predecessor_idx = tile_idx - threadIdx.x - 1;
-        StatusWord  predecessor_status;
-        T           window_aggregate;
-
-        // Wait for the warp-wide window of predecessor tiles to become valid
-        ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
-
-        // The exclusive tile prefix starts out as the current window aggregate
-        exclusive_prefix = window_aggregate;
-
-        // Keep sliding the window back until we come across a tile whose inclusive prefix is known
-        while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff))
-        {
-            predecessor_idx -= CUB_PTX_WARP_THREADS;
-
-            // Update exclusive tile prefix with the window prefix
-            ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
-            exclusive_prefix = scan_op(window_aggregate, exclusive_prefix);
-        }
-
-        // Compute the inclusive tile prefix and update the status for this tile
-        if (threadIdx.x == 0)
-        {
-            inclusive_prefix = scan_op(exclusive_prefix, block_aggregate);
-            tile_status.SetInclusive(tile_idx, inclusive_prefix);
-
-            temp_storage.exclusive_prefix = exclusive_prefix;
-            temp_storage.inclusive_prefix = inclusive_prefix;
-        }
-
-        // Return exclusive_prefix
-        return exclusive_prefix;
-    }
-
-    // Get the exclusive prefix stored in temporary storage
-    __device__ __forceinline__
-    T GetExclusivePrefix()
-    {
-        return temp_storage.exclusive_prefix;
-    }
-
-    // Get the inclusive prefix stored in temporary storage
-    __device__ __forceinline__
-    T GetInclusivePrefix()
-    {
-        return temp_storage.inclusive_prefix;
-    }
-
-    // Get the block aggregate stored in temporary storage
-    __device__ __forceinline__
-    T GetBlockAggregate()
-    {
-        return temp_storage.block_aggregate;
-    }
-
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh b/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh
deleted file mode 100644
index dae1f3018..000000000
--- a/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh
+++ /dev/null
@@ -1,596 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../util_type.cuh"
-#include "../util_ptx.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-template <
-    typename    T,
-    int         BLOCK_DIM_X,
-    int         BLOCK_DIM_Y     = 1,
-    int         BLOCK_DIM_Z     = 1,
-    int         PTX_ARCH        = CUB_PTX_ARCH>
-class BlockAdjacentDifference
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-
-    /// Shared memory storage layout type (last element from each thread's input)
-    struct _TempStorage
-    {
-        T first_items[BLOCK_THREADS];
-        T last_items[BLOCK_THREADS];
-    };
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /// Specialization for when FlagOp has third index param
-    template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
-    struct ApplyOp
-    {
-        // Apply flag operator
-        static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
-        {
-            return flag_op(b, a, idx);
-        }
-    };
-
-    /// Specialization for when FlagOp does not have a third index param
-    template <typename FlagOp>
-    struct ApplyOp<FlagOp, false>
-    {
-        // Apply flag operator
-        static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/)
-        {
-            return flag_op(b, a);
-        }
-    };
-
-    /// Templated unrolling of item comparison (inductive case)
-    template <int ITERATION, int MAX_ITERATIONS>
-    struct Iterate
-    {
-        // Head flags
-        template <
-            int             ITEMS_PER_THREAD,
-            typename        FlagT,
-            typename        FlagOp>
-        static __device__ __forceinline__ void FlagHeads(
-            int                     linear_tid,
-            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
-            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-            T                       (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
-            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
-        {
-            preds[ITERATION] = input[ITERATION - 1];
-
-            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                preds[ITERATION],
-                input[ITERATION],
-                (linear_tid * ITEMS_PER_THREAD) + ITERATION);
-
-            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagHeads(linear_tid, flags, input, preds, flag_op);
-        }
-
-        // Tail flags
-        template <
-            int             ITEMS_PER_THREAD,
-            typename        FlagT,
-            typename        FlagOp>
-        static __device__ __forceinline__ void FlagTails(
-            int                     linear_tid,
-            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
-            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
-        {
-            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                input[ITERATION],
-                input[ITERATION + 1],
-                (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1);
-
-            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagTails(linear_tid, flags, input, flag_op);
-        }
-
-    };
-
-    /// Templated unrolling of item comparison (termination case)
-    template <int MAX_ITERATIONS>
-    struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
-    {
-        // Head flags
-        template <
-            int             ITEMS_PER_THREAD,
-            typename        FlagT,
-            typename        FlagOp>
-        static __device__ __forceinline__ void FlagHeads(
-            int                     /*linear_tid*/,
-            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
-            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-            T                       (&/*preds*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
-            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
-        {}
-
-        // Tail flags
-        template <
-            int             ITEMS_PER_THREAD,
-            typename        FlagT,
-            typename        FlagOp>
-        static __device__ __forceinline__ void FlagTails(
-            int                     /*linear_tid*/,
-            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
-            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
-        {}
-    };
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    unsigned int linear_tid;
-
-
-public:
-
-    /// \smemstorage{BlockDiscontinuity}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockAdjacentDifference()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockAdjacentDifference(
-        TempStorage &temp_storage)  ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Head flag operations
-     *********************************************************************/
-    //@{
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeads(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share last item
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        if (linear_tid == 0)
-        {
-            // Set flag for first thread-item (preds[0] is undefined)
-            head_flags[0] = 1;
-        }
-        else
-        {
-            preds[0] = temp_storage.last_items[linear_tid - 1];
-            head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
-        }
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-    }
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeads(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
-        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
-        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
-    {
-        // Share last item
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        // Set flag for first thread-item
-        preds[0] = (linear_tid == 0) ?
-            tile_predecessor_item :              // First thread
-            temp_storage.last_items[linear_tid - 1];
-
-        head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-    }
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeads(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        T preds[ITEMS_PER_THREAD];
-        FlagHeads(head_flags, input, preds, flag_op);
-    }
-
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeads(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
-        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
-    {
-        T preds[ITEMS_PER_THREAD];
-        FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item);
-    }
-
-
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagTails(
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share first item
-        temp_storage.first_items[linear_tid] = input[0];
-
-        CTA_SYNC();
-
-        // Set flag for last thread-item
-        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
-            1 :                             // Last thread
-            ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                input[ITEMS_PER_THREAD - 1],
-                temp_storage.first_items[linear_tid + 1],
-                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagTails(
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
-        T               tile_successor_item)                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
-    {
-        // Share first item
-        temp_storage.first_items[linear_tid] = input[0];
-
-        CTA_SYNC();
-
-        // Set flag for last thread-item
-        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
-            tile_successor_item :              // Last thread
-            temp_storage.first_items[linear_tid + 1];
-
-        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
-            flag_op,
-            input[ITEMS_PER_THREAD - 1],
-            successor_item,
-            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeadsAndTails(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share first and last items
-        temp_storage.first_items[linear_tid] = input[0];
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        T preds[ITEMS_PER_THREAD];
-
-        // Set flag for first thread-item
-        preds[0] = temp_storage.last_items[linear_tid - 1];
-        if (linear_tid == 0)
-        {
-            head_flags[0] = 1;
-        }
-        else
-        {
-            head_flags[0] = ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                preds[0],
-                input[0],
-                linear_tid * ITEMS_PER_THREAD);
-        }
-
-
-        // Set flag for last thread-item
-        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
-            1 :                             // Last thread
-            ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                input[ITEMS_PER_THREAD - 1],
-                temp_storage.first_items[linear_tid + 1],
-                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeadsAndTails(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share first and last items
-        temp_storage.first_items[linear_tid] = input[0];
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        T preds[ITEMS_PER_THREAD];
-
-        // Set flag for first thread-item
-        if (linear_tid == 0)
-        {
-            head_flags[0] = 1;
-        }
-        else
-        {
-            preds[0] = temp_storage.last_items[linear_tid - 1];
-            head_flags[0] = ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                preds[0],
-                input[0],
-                linear_tid * ITEMS_PER_THREAD);
-        }
-
-        // Set flag for last thread-item
-        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
-            tile_successor_item :              // Last thread
-            temp_storage.first_items[linear_tid + 1];
-
-        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
-            flag_op,
-            input[ITEMS_PER_THREAD - 1],
-            successor_item,
-            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeadsAndTails(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share first and last items
-        temp_storage.first_items[linear_tid] = input[0];
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        T preds[ITEMS_PER_THREAD];
-
-        // Set flag for first thread-item
-        preds[0] = (linear_tid == 0) ?
-            tile_predecessor_item :              // First thread
-            temp_storage.last_items[linear_tid - 1];
-
-        head_flags[0] = ApplyOp<FlagOp>::FlagT(
-            flag_op,
-            preds[0],
-            input[0],
-            linear_tid * ITEMS_PER_THREAD);
-
-        // Set flag for last thread-item
-        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
-            1 :                             // Last thread
-            ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                input[ITEMS_PER_THREAD - 1],
-                temp_storage.first_items[linear_tid + 1],
-                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeadsAndTails(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share first and last items
-        temp_storage.first_items[linear_tid] = input[0];
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        T preds[ITEMS_PER_THREAD];
-
-        // Set flag for first thread-item
-        preds[0] = (linear_tid == 0) ?
-            tile_predecessor_item :              // First thread
-            temp_storage.last_items[linear_tid - 1];
-
-        head_flags[0] = ApplyOp<FlagOp>::FlagT(
-            flag_op,
-            preds[0],
-            input[0],
-            linear_tid * ITEMS_PER_THREAD);
-
-        // Set flag for last thread-item
-        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
-            tile_successor_item :              // Last thread
-            temp_storage.first_items[linear_tid + 1];
-
-        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
-            flag_op,
-            input[ITEMS_PER_THREAD - 1],
-            successor_item,
-            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh b/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh
deleted file mode 100644
index f43ee39ee..000000000
--- a/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh
+++ /dev/null
@@ -1,1148 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../util_type.cuh"
-#include "../util_ptx.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief The BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. ![](discont_logo.png)
- * \ingroup BlockModule
- *
- * \tparam T                The data type to be flagged.
- * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
- * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items
- *   that differ from their predecessors (or successors).  For example, head flags are convenient
- *   for demarcating disjoint data segments as part of a segmented scan or reduction.
- * - \blocked
- *
- * \par Performance Considerations
- * - \granularity
- *
- * \par A Simple Example
- * \blockcollective{BlockDiscontinuity}
- * \par
- * The code snippet below illustrates the head flagging of 512 integer items that
- * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
- *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
- *
- *     // Allocate shared memory for BlockDiscontinuity
- *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     ...
- *
- *     // Collectively compute head flags for discontinuities in the segment
- *     int head_flags[4];
- *     BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the block of threads is
- * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>.
- * The corresponding output \p head_flags in those threads will be
- * <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
- *
- * \par Performance Considerations
- * - Incurs zero bank conflicts for most types
- *
- */
-template <
-    typename    T,
-    int         BLOCK_DIM_X,
-    int         BLOCK_DIM_Y     = 1,
-    int         BLOCK_DIM_Z     = 1,
-    int         PTX_ARCH        = CUB_PTX_ARCH>
-class BlockDiscontinuity
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-
-    /// Shared memory storage layout type (last element from each thread's input)
-    struct _TempStorage
-    {
-        T first_items[BLOCK_THREADS];
-        T last_items[BLOCK_THREADS];
-    };
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /// Specialization for when FlagOp has third index param
-    template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
-    struct ApplyOp
-    {
-        // Apply flag operator
-        static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
-        {
-            return flag_op(a, b, idx);
-        }
-    };
-
-    /// Specialization for when FlagOp does not have a third index param
-    template <typename FlagOp>
-    struct ApplyOp<FlagOp, false>
-    {
-        // Apply flag operator
-        static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/)
-        {
-            return flag_op(a, b);
-        }
-    };
-
-    /// Templated unrolling of item comparison (inductive case)
-    template <int ITERATION, int MAX_ITERATIONS>
-    struct Iterate
-    {
-        // Head flags
-        template <
-            int             ITEMS_PER_THREAD,
-            typename        FlagT,
-            typename        FlagOp>
-        static __device__ __forceinline__ void FlagHeads(
-            int                     linear_tid,
-            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
-            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-            T                       (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
-            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
-        {
-            preds[ITERATION] = input[ITERATION - 1];
-
-            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                preds[ITERATION],
-                input[ITERATION],
-                (linear_tid * ITEMS_PER_THREAD) + ITERATION);
-
-            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagHeads(linear_tid, flags, input, preds, flag_op);
-        }
-
-        // Tail flags
-        template <
-            int             ITEMS_PER_THREAD,
-            typename        FlagT,
-            typename        FlagOp>
-        static __device__ __forceinline__ void FlagTails(
-            int                     linear_tid,
-            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
-            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
-        {
-            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                input[ITERATION],
-                input[ITERATION + 1],
-                (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1);
-
-            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagTails(linear_tid, flags, input, flag_op);
-        }
-
-    };
-
-    /// Templated unrolling of item comparison (termination case)
-    template <int MAX_ITERATIONS>
-    struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
-    {
-        // Head flags
-        template <
-            int             ITEMS_PER_THREAD,
-            typename        FlagT,
-            typename        FlagOp>
-        static __device__ __forceinline__ void FlagHeads(
-            int                     /*linear_tid*/,
-            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
-            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-            T                       (&/*preds*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
-            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
-        {}
-
-        // Tail flags
-        template <
-            int             ITEMS_PER_THREAD,
-            typename        FlagT,
-            typename        FlagOp>
-        static __device__ __forceinline__ void FlagTails(
-            int                     /*linear_tid*/,
-            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
-            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
-        {}
-    };
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    unsigned int linear_tid;
-
-
-public:
-
-    /// \smemstorage{BlockDiscontinuity}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockDiscontinuity()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockDiscontinuity(
-        TempStorage &temp_storage)  ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Head flag operations
-     *********************************************************************/
-    //@{
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeads(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share last item
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        if (linear_tid == 0)
-        {
-            // Set flag for first thread-item (preds[0] is undefined)
-            head_flags[0] = 1;
-        }
-        else
-        {
-            preds[0] = temp_storage.last_items[linear_tid - 1];
-            head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
-        }
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-    }
-
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeads(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
-        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
-        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
-    {
-        // Share last item
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        // Set flag for first thread-item
-        preds[0] = (linear_tid == 0) ?
-            tile_predecessor_item :              // First thread
-            temp_storage.last_items[linear_tid - 1];
-
-        head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-    }
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-    /**
-     * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged.
-     *
-     * \par
-     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
-     *   returns \p true (where <em>previous-item</em> is either the preceding item
-     *   in the same thread or the last item in the previous thread).
-     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the head-flagging of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute head flags for discontinuities in the segment
-     *     int head_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>.
-     * The corresponding output \p head_flags in those threads will be
-     * <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeads(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        T preds[ITEMS_PER_THREAD];
-        FlagHeads(head_flags, input, preds, flag_op);
-    }
-
-
-    /**
-     * \brief Sets head flags indicating discontinuities between items partitioned across the thread block.
-     *
-     * \par
-     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
-     *   returns \p true (where <em>previous-item</em> is either the preceding item
-     *   in the same thread or the last item in the previous thread).
-     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
-     *   against \p tile_predecessor_item.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the head-flagging of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Have thread0 obtain the predecessor item for the entire tile
-     *     int tile_predecessor_item;
-     *     if (threadIdx.x == 0) tile_predecessor_item == ...
-     *
-     *     // Collectively compute head flags for discontinuities in the segment
-     *     int head_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagHeads(
-     *         head_flags, thread_data, cub::Inequality(), tile_predecessor_item);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>,
-     * and that \p tile_predecessor_item is \p 0.  The corresponding output \p head_flags in those threads will be
-     * <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeads(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
-        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
-    {
-        T preds[ITEMS_PER_THREAD];
-        FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item);
-    }
-
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Tail flag operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged.
-     *
-     * \par
-     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
-     *   returns \p true (where <em>next-item</em> is either the next item
-     *   in the same thread or the first item in the next thread).
-     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
-     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the tail-flagging of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute tail flags for discontinuities in the segment
-     *     int tail_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>.
-     * The corresponding output \p tail_flags in those threads will be
-     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagTails(
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share first item
-        temp_storage.first_items[linear_tid] = input[0];
-
-        CTA_SYNC();
-
-        // Set flag for last thread-item
-        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
-            1 :                             // Last thread
-            ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                input[ITEMS_PER_THREAD - 1],
-                temp_storage.first_items[linear_tid + 1],
-                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    /**
-     * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block.
-     *
-     * \par
-     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
-     *   returns \p true (where <em>next-item</em> is either the next item
-     *   in the same thread or the first item in the next thread).
-     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
-     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
-     *   against \p tile_successor_item.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the tail-flagging of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Have thread127 obtain the successor item for the entire tile
-     *     int tile_successor_item;
-     *     if (threadIdx.x == 127) tile_successor_item == ...
-     *
-     *     // Collectively compute tail flags for discontinuities in the segment
-     *     int tail_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagTails(
-     *         tail_flags, thread_data, cub::Inequality(), tile_successor_item);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
-     * and that \p tile_successor_item is \p 125.  The corresponding output \p tail_flags in those threads will be
-     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagTails(
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
-        T               tile_successor_item)                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
-    {
-        // Share first item
-        temp_storage.first_items[linear_tid] = input[0];
-
-        CTA_SYNC();
-
-        // Set flag for last thread-item
-        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
-            tile_successor_item :              // Last thread
-            temp_storage.first_items[linear_tid + 1];
-
-        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
-            flag_op,
-            input[ITEMS_PER_THREAD - 1],
-            successor_item,
-            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Head & tail flag operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
-     *
-     * \par
-     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
-     *   returns \p true (where <em>previous-item</em> is either the preceding item
-     *   in the same thread or the last item in the previous thread).
-     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
-     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
-     *   returns \p true (where <em>next-item</em> is either the next item
-     *   in the same thread or the first item in the next thread).
-     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
-     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute head and flags for discontinuities in the segment
-     *     int head_flags[4];
-     *     int tail_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagTails(
-     *         head_flags, tail_flags, thread_data, cub::Inequality());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
-     * and that the tile_successor_item is \p 125.  The corresponding output \p head_flags
-     * in those threads will be <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
-     * and the corresponding output \p tail_flags in those threads will be
-     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeadsAndTails(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share first and last items
-        temp_storage.first_items[linear_tid] = input[0];
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        T preds[ITEMS_PER_THREAD];
-
-        // Set flag for first thread-item
-        preds[0] = temp_storage.last_items[linear_tid - 1];
-        if (linear_tid == 0)
-        {
-            head_flags[0] = 1;
-        }
-        else
-        {
-            head_flags[0] = ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                preds[0],
-                input[0],
-                linear_tid * ITEMS_PER_THREAD);
-        }
-
-
-        // Set flag for last thread-item
-        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
-            1 :                             // Last thread
-            ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                input[ITEMS_PER_THREAD - 1],
-                temp_storage.first_items[linear_tid + 1],
-                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    /**
-     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
-     *
-     * \par
-     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
-     *   returns \p true (where <em>previous-item</em> is either the preceding item
-     *   in the same thread or the last item in the previous thread).
-     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
-     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
-     *   returns \p true (where <em>next-item</em> is either the next item
-     *   in the same thread or the first item in the next thread).
-     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
-     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
-     *   against \p tile_predecessor_item.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Have thread127 obtain the successor item for the entire tile
-     *     int tile_successor_item;
-     *     if (threadIdx.x == 127) tile_successor_item == ...
-     *
-     *     // Collectively compute head and flags for discontinuities in the segment
-     *     int head_flags[4];
-     *     int tail_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagTails(
-     *         head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
-     * and that the tile_successor_item is \p 125.  The corresponding output \p head_flags
-     * in those threads will be <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
-     * and the corresponding output \p tail_flags in those threads will be
-     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeadsAndTails(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share first and last items
-        temp_storage.first_items[linear_tid] = input[0];
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        T preds[ITEMS_PER_THREAD];
-
-        // Set flag for first thread-item
-        if (linear_tid == 0)
-        {
-            head_flags[0] = 1;
-        }
-        else
-        {
-            preds[0] = temp_storage.last_items[linear_tid - 1];
-            head_flags[0] = ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                preds[0],
-                input[0],
-                linear_tid * ITEMS_PER_THREAD);
-        }
-
-        // Set flag for last thread-item
-        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
-            tile_successor_item :              // Last thread
-            temp_storage.first_items[linear_tid + 1];
-
-        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
-            flag_op,
-            input[ITEMS_PER_THREAD - 1],
-            successor_item,
-            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    /**
-     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
-     *
-     * \par
-     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
-     *   returns \p true (where <em>previous-item</em> is either the preceding item
-     *   in the same thread or the last item in the previous thread).
-     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
-     *   against \p tile_predecessor_item.
-     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
-     *   returns \p true (where <em>next-item</em> is either the next item
-     *   in the same thread or the first item in the next thread).
-     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
-     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Have thread0 obtain the predecessor item for the entire tile
-     *     int tile_predecessor_item;
-     *     if (threadIdx.x == 0) tile_predecessor_item == ...
-     *
-     *     // Have thread127 obtain the successor item for the entire tile
-     *     int tile_successor_item;
-     *     if (threadIdx.x == 127) tile_successor_item == ...
-     *
-     *     // Collectively compute head and flags for discontinuities in the segment
-     *     int head_flags[4];
-     *     int tail_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagTails(
-     *         head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
-     *         thread_data, cub::Inequality());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>,
-     * that the \p tile_predecessor_item is \p 0, and that the
-     * \p tile_successor_item is \p 125.  The corresponding output \p head_flags
-     * in those threads will be <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
-     * and the corresponding output \p tail_flags in those threads will be
-     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeadsAndTails(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share first and last items
-        temp_storage.first_items[linear_tid] = input[0];
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        T preds[ITEMS_PER_THREAD];
-
-        // Set flag for first thread-item
-        preds[0] = (linear_tid == 0) ?
-            tile_predecessor_item :              // First thread
-            temp_storage.last_items[linear_tid - 1];
-
-        head_flags[0] = ApplyOp<FlagOp>::FlagT(
-            flag_op,
-            preds[0],
-            input[0],
-            linear_tid * ITEMS_PER_THREAD);
-
-        // Set flag for last thread-item
-        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
-            1 :                             // Last thread
-            ApplyOp<FlagOp>::FlagT(
-                flag_op,
-                input[ITEMS_PER_THREAD - 1],
-                temp_storage.first_items[linear_tid + 1],
-                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    /**
-     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
-     *
-     * \par
-     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
-     *   returns \p true (where <em>previous-item</em> is either the preceding item
-     *   in the same thread or the last item in the previous thread).
-     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
-     *   against \p tile_predecessor_item.
-     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
-     *   returns \p true (where <em>next-item</em> is either the next item
-     *   in the same thread or the first item in the next thread).
-     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
-     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
-     *   against \p tile_successor_item.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Have thread0 obtain the predecessor item for the entire tile
-     *     int tile_predecessor_item;
-     *     if (threadIdx.x == 0) tile_predecessor_item == ...
-     *
-     *     // Have thread127 obtain the successor item for the entire tile
-     *     int tile_successor_item;
-     *     if (threadIdx.x == 127) tile_successor_item == ...
-     *
-     *     // Collectively compute head and flags for discontinuities in the segment
-     *     int head_flags[4];
-     *     int tail_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagTails(
-     *         head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
-     *         thread_data, cub::Inequality());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>,
-     * that the \p tile_predecessor_item is \p 0, and that the
-     * \p tile_successor_item is \p 125.  The corresponding output \p head_flags
-     * in those threads will be <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
-     * and the corresponding output \p tail_flags in those threads will be
-     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeadsAndTails(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share first and last items
-        temp_storage.first_items[linear_tid] = input[0];
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        T preds[ITEMS_PER_THREAD];
-
-        // Set flag for first thread-item
-        preds[0] = (linear_tid == 0) ?
-            tile_predecessor_item :              // First thread
-            temp_storage.last_items[linear_tid - 1];
-
-        head_flags[0] = ApplyOp<FlagOp>::FlagT(
-            flag_op,
-            preds[0],
-            input[0],
-            linear_tid * ITEMS_PER_THREAD);
-
-        // Set flag for last thread-item
-        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
-            tile_successor_item :              // Last thread
-            temp_storage.first_items[linear_tid + 1];
-
-        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
-            flag_op,
-            input[ITEMS_PER_THREAD - 1],
-            successor_item,
-            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-
-
-    //@}  end member group
-
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/block/block_exchange.cuh b/thrust/system/cuda/detail/cub/block/block_exchange.cuh
deleted file mode 100644
index 7cc8c5abb..000000000
--- a/thrust/system/cuda/detail/cub/block/block_exchange.cuh
+++ /dev/null
@@ -1,1248 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../util_ptx.cuh"
-#include "../util_arch.cuh"
-#include "../util_macro.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief The BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png)
- * \ingroup BlockModule
- *
- * \tparam T                    The data type to be exchanged.
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam ITEMS_PER_THREAD     The number of items partitioned onto each thread.
- * \tparam WARP_TIME_SLICING    <b>[optional]</b> When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds.  Yields a smaller memory footprint at the expense of decreased parallelism.  (Default: false)
- * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - It is commonplace for blocks of threads to rearrange data items between
- *   threads.  For example, the device-accessible memory subsystem prefers access patterns
- *   where data items are "striped" across threads (where consecutive threads access consecutive items),
- *   yet most block-wide operations prefer a "blocked" partitioning of items across threads
- *   (where consecutive items belong to a single thread).
- * - BlockExchange supports the following types of data exchanges:
- *   - Transposing between [<em>blocked</em>](index.html#sec5sec3) and [<em>striped</em>](index.html#sec5sec3) arrangements
- *   - Transposing between [<em>blocked</em>](index.html#sec5sec3) and [<em>warp-striped</em>](index.html#sec5sec3) arrangements
- *   - Scattering ranked items to a [<em>blocked arrangement</em>](index.html#sec5sec3)
- *   - Scattering ranked items to a [<em>striped arrangement</em>](index.html#sec5sec3)
- * - \rowmajor
- *
- * \par A Simple Example
- * \blockcollective{BlockExchange}
- * \par
- * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
- * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
- *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
- *
- *     // Allocate shared memory for BlockExchange
- *     __shared__ typename BlockExchange::TempStorage temp_storage;
- *
- *     // Load a tile of data striped across threads
- *     int thread_data[4];
- *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
- *
- *     // Collectively exchange data into a blocked arrangement across threads
- *     BlockExchange(temp_storage).StripedToBlocked(thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of striped input \p thread_data across the block of threads is
- * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt>.
- * The corresponding output \p thread_data in those threads will be
- * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
- *
- * \par Performance Considerations
- * - Proper device-specific padding ensures zero bank conflicts for most types.
- *
- */
-template <
-    typename    InputT,
-    int         BLOCK_DIM_X,
-    int         ITEMS_PER_THREAD,
-    bool        WARP_TIME_SLICING   = false,
-    int         BLOCK_DIM_Y         = 1,
-    int         BLOCK_DIM_Z         = 1,
-    int         PTX_ARCH            = CUB_PTX_ARCH>
-class BlockExchange
-{
-private:
-
-    /******************************************************************************
-     * Constants
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
-        WARP_THREADS                = 1 << LOG_WARP_THREADS,
-        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-
-        LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(PTX_ARCH),
-        SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,
-
-        TILE_ITEMS                  = BLOCK_THREADS * ITEMS_PER_THREAD,
-
-        TIME_SLICES                 = (WARP_TIME_SLICING) ? WARPS : 1,
-
-        TIME_SLICED_THREADS         = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS,
-        TIME_SLICED_ITEMS           = TIME_SLICED_THREADS * ITEMS_PER_THREAD,
-
-        WARP_TIME_SLICED_THREADS    = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
-        WARP_TIME_SLICED_ITEMS      = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD,
-
-        // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
-        INSERT_PADDING              = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
-        PADDING_ITEMS               = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0,
-    };
-
-    /******************************************************************************
-     * Type definitions
-     ******************************************************************************/
-
-    /// Shared memory storage layout type
-    struct __align__(16) _TempStorage
-    {
-        InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS];
-    };
-
-public:
-
-    /// \smemstorage{BlockExchange}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-private:
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    unsigned int linear_tid;
-    unsigned int lane_id;
-    unsigned int warp_id;
-    unsigned int warp_offset;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /**
-     * Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.  Specialized for no timeslicing.
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void BlockedToStriped(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        Int2Type<false> /*time_slicing*/)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            temp_storage.buff[item_offset] = input_items[ITEM];
-        }
-
-        CTA_SYNC();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            output_items[ITEM] = temp_storage.buff[item_offset];
-        }
-    }
-
-
-    /**
-     * Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.  Specialized for warp-timeslicing.
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void BlockedToStriped(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        Int2Type<true>  /*time_slicing*/)
-    {
-        InputT temp_items[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
-        {
-            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
-            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;
-
-            CTA_SYNC();
-
-            if (warp_id == SLICE)
-            {
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-                {
-                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
-                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    temp_storage.buff[item_offset] = input_items[ITEM];
-                }
-            }
-
-            CTA_SYNC();
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                // Read a strip of items
-                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
-                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
-
-                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
-                {
-                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
-                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
-                    {
-                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                        temp_items[ITEM] = temp_storage.buff[item_offset];
-                    }
-                }
-            }
-        }
-
-        // Copy
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            output_items[ITEM] = temp_items[ITEM];
-        }
-    }
-
-
-    /**
-     * Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. Specialized for no timeslicing
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void BlockedToWarpStriped(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        Int2Type<false> /*time_slicing*/)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            temp_storage.buff[item_offset] = input_items[ITEM];
-        }
-
-        WARP_SYNC(0xffffffff);
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            output_items[ITEM] = temp_storage.buff[item_offset];
-        }
-    }
-
-    /**
-     * Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. Specialized for warp-timeslicing
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void BlockedToWarpStriped(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        Int2Type<true>  /*time_slicing*/)
-    {
-        if (warp_id == 0)
-        {
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
-                if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                temp_storage.buff[item_offset] = input_items[ITEM];
-            }
-
-            WARP_SYNC(0xffffffff);
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
-                if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                output_items[ITEM] = temp_storage.buff[item_offset];
-            }
-        }
-
-        #pragma unroll
-        for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE)
-        {
-            CTA_SYNC();
-
-            if (warp_id == SLICE)
-            {
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-                {
-                    int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
-                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    temp_storage.buff[item_offset] = input_items[ITEM];
-                }
-
-                WARP_SYNC(0xffffffff);
-
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-                {
-                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
-                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    output_items[ITEM] = temp_storage.buff[item_offset];
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for no timeslicing.
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void StripedToBlocked(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        Int2Type<false> /*time_slicing*/)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            temp_storage.buff[item_offset] = input_items[ITEM];
-        }
-
-        CTA_SYNC();
-
-        // No timeslicing
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            output_items[ITEM] = temp_storage.buff[item_offset];
-        }
-    }
-
-
-    /**
-     * Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for warp-timeslicing.
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void StripedToBlocked(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        Int2Type<true>  /*time_slicing*/)
-    {
-        // Warp time-slicing
-        InputT temp_items[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
-        {
-            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
-            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;
-
-            CTA_SYNC();
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                // Write a strip of items
-                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
-                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
-
-                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
-                {
-                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
-                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
-                    {
-                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                        temp_storage.buff[item_offset] = input_items[ITEM];
-                    }
-                }
-            }
-
-            CTA_SYNC();
-
-            if (warp_id == SLICE)
-            {
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-                {
-                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
-                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    temp_items[ITEM] = temp_storage.buff[item_offset];
-                }
-            }
-        }
-
-        // Copy
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            output_items[ITEM] = temp_items[ITEM];
-        }
-    }
-
-
-    /**
-     * Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for no timeslicing
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void WarpStripedToBlocked(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        Int2Type<false> /*time_slicing*/)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            temp_storage.buff[item_offset] = input_items[ITEM];
-        }
-
-        WARP_SYNC(0xffffffff);
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            output_items[ITEM] = temp_storage.buff[item_offset];
-        }
-    }
-
-
-    /**
-     * Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for warp-timeslicing
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void WarpStripedToBlocked(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        Int2Type<true>  /*time_slicing*/)
-    {
-        #pragma unroll
-        for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
-        {
-            CTA_SYNC();
-
-            if (warp_id == SLICE)
-            {
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-                {
-                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
-                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    temp_storage.buff[item_offset] = input_items[ITEM];
-                }
-
-                WARP_SYNC(0xffffffff);
-
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-                {
-                    int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
-                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    output_items[ITEM] = temp_storage.buff[item_offset];
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Exchanges data items annotated by rank into <em>blocked</em> arrangement.  Specialized for no timeslicing.
-     */
-    template <typename OutputT, typename OffsetT>
-    __device__ __forceinline__ void ScatterToBlocked(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
-        Int2Type<false> /*time_slicing*/)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = ranks[ITEM];
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            temp_storage.buff[item_offset] = input_items[ITEM];
-        }
-
-        CTA_SYNC();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            output_items[ITEM] = temp_storage.buff[item_offset];
-        }
-    }
-
-    /**
-     * Exchanges data items annotated by rank into <em>blocked</em> arrangement.  Specialized for warp-timeslicing.
-     */
-    template <typename OutputT, typename OffsetT>
-    __device__ __forceinline__ void ScatterToBlocked(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
-        Int2Type<true>  /*time_slicing*/)
-    {
-        InputT temp_items[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
-        {
-            CTA_SYNC();
-
-            const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE;
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                int item_offset = ranks[ITEM] - SLICE_OFFSET;
-                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
-                {
-                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-                    temp_storage.buff[item_offset] = input_items[ITEM];
-                }
-            }
-
-            CTA_SYNC();
-
-            if (warp_id == SLICE)
-            {
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-                {
-                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
-                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-                    temp_items[ITEM] = temp_storage.buff[item_offset];
-                }
-            }
-        }
-
-        // Copy
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            output_items[ITEM] = temp_items[ITEM];
-        }
-    }
-
-
-    /**
-     * Exchanges data items annotated by rank into <em>striped</em> arrangement.  Specialized for no timeslicing.
-     */
-    template <typename OutputT, typename OffsetT>
-    __device__ __forceinline__ void ScatterToStriped(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
-        Int2Type<false> /*time_slicing*/)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = ranks[ITEM];
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            temp_storage.buff[item_offset] = input_items[ITEM];
-        }
-
-        CTA_SYNC();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            output_items[ITEM] = temp_storage.buff[item_offset];
-        }
-    }
-
-
-    /**
-     * Exchanges data items annotated by rank into <em>striped</em> arrangement.  Specialized for warp-timeslicing.
-     */
-    template <typename OutputT, typename OffsetT>
-    __device__ __forceinline__ void ScatterToStriped(
-        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
-        Int2Type<true> /*time_slicing*/)
-    {
-        InputT temp_items[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
-        {
-            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
-            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;
-
-            CTA_SYNC();
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                int item_offset = ranks[ITEM] - SLICE_OFFSET;
-                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
-                {
-                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-                    temp_storage.buff[item_offset] = input_items[ITEM];
-                }
-            }
-
-            CTA_SYNC();
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                // Read a strip of items
-                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
-                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
-
-                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
-                {
-                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
-                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
-                    {
-                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                        temp_items[ITEM] = temp_storage.buff[item_offset];
-                    }
-                }
-            }
-        }
-
-        // Copy
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            output_items[ITEM] = temp_items[ITEM];
-        }
-    }
-
-
-public:
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockExchange()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
-        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
-        lane_id(LaneId()),
-        warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockExchange(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
-        lane_id(LaneId()),
-        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
-        warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Structured exchanges
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement
-     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, ...)
-     * {
-     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
-     *
-     *     // Allocate shared memory for BlockExchange
-     *     __shared__ typename BlockExchange::TempStorage temp_storage;
-     *
-     *     // Load a tile of ordered data into a striped arrangement across block threads
-     *     int thread_data[4];
-     *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
-     *
-     *     // Collectively exchange data into a blocked arrangement across threads
-     *     BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of striped input \p thread_data across the block of threads is
-     * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> after loading from device-accessible memory.
-     * The corresponding output \p thread_data in those threads will be
-     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
-     *
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void StripedToBlocked(
-        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-    {
-        StripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
-    }
-
-
-    /**
-     * \brief Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
-     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, ...)
-     * {
-     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
-     *
-     *     // Allocate shared memory for BlockExchange
-     *     __shared__ typename BlockExchange::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively exchange data into a striped arrangement across threads
-     *     BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data);
-     *
-     *     // Store data striped across block threads into an ordered tile
-     *     cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of blocked input \p thread_data across the block of threads is
-     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
-     * The corresponding output \p thread_data in those threads will be
-     * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> in
-     * preparation for storing to device-accessible memory.
-     *
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void BlockedToStriped(
-        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-    {
-        BlockedToStriped(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
-    }
-
-
-
-    /**
-     * \brief Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement
-     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, ...)
-     * {
-     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
-     *
-     *     // Allocate shared memory for BlockExchange
-     *     __shared__ typename BlockExchange::TempStorage temp_storage;
-     *
-     *     // Load a tile of ordered data into a warp-striped arrangement across warp threads
-     *     int thread_data[4];
-     *     cub::LoadSWarptriped<LOAD_DEFAULT>(threadIdx.x, d_data, thread_data);
-     *
-     *     // Collectively exchange data into a blocked arrangement across threads
-     *     BlockExchange(temp_storage).WarpStripedToBlocked(thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of warp-striped input \p thread_data across the block of threads is
-     * <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt>
-     * after loading from device-accessible memory.  (The first 128 items are striped across
-     * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
-     * The corresponding output \p thread_data in those threads will be
-     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
-     *
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void WarpStripedToBlocked(
-        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-    {
-        WarpStripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
-    }
-
-
-
-    /**
-     * \brief Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement
-     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, ...)
-     * {
-     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
-     *
-     *     // Allocate shared memory for BlockExchange
-     *     __shared__ typename BlockExchange::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively exchange data into a warp-striped arrangement across threads
-     *     BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data);
-     *
-     *     // Store data striped across warp threads into an ordered tile
-     *     cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of blocked input \p thread_data across the block of threads is
-     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
-     * The corresponding output \p thread_data in those threads will be
-     * <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt>
-     * in preparation for storing to device-accessible memory. (The first 128 items are striped across
-     * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
-     *
-     */
-    template <typename OutputT>
-    __device__ __forceinline__ void BlockedToWarpStriped(
-        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-    {
-        BlockedToWarpStriped(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
-    }
-
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Scatter exchanges
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Exchanges data items annotated by rank into <em>blocked</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
-     */
-    template <typename OutputT, typename OffsetT>
-    __device__ __forceinline__ void ScatterToBlocked(
-        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OffsetT     ranks[ITEMS_PER_THREAD])            ///< [in] Corresponding scatter ranks
-    {
-        ScatterToBlocked(input_items, output_items, ranks, Int2Type<WARP_TIME_SLICING>());
-    }
-
-
-
-    /**
-     * \brief Exchanges data items annotated by rank into <em>striped</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
-     */
-    template <typename OutputT, typename OffsetT>
-    __device__ __forceinline__ void ScatterToStriped(
-        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OffsetT     ranks[ITEMS_PER_THREAD])            ///< [in] Corresponding scatter ranks
-    {
-        ScatterToStriped(input_items, output_items, ranks, Int2Type<WARP_TIME_SLICING>());
-    }
-
-
-
-    /**
-     * \brief Exchanges data items annotated by rank into <em>striped</em> arrangement.  Items with rank -1 are not exchanged.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
-     */
-    template <typename OutputT, typename OffsetT>
-    __device__ __forceinline__ void ScatterToStripedGuarded(
-        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OffsetT     ranks[ITEMS_PER_THREAD])            ///< [in] Corresponding scatter ranks
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = ranks[ITEM];
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            if (ranks[ITEM] >= 0)
-                temp_storage.buff[item_offset] = input_items[ITEM];
-        }
-
-        CTA_SYNC();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            output_items[ITEM] = temp_storage.buff[item_offset];
-        }
-    }
-
-
-
-
-    /**
-     * \brief Exchanges valid data items annotated by rank into <em>striped</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
-     * \tparam ValidFlag                            <b>[inferred]</b> FlagT type denoting which items are valid
-     */
-    template <typename OutputT, typename OffsetT, typename ValidFlag>
-    __device__ __forceinline__ void ScatterToStripedFlagged(
-        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OffsetT     ranks[ITEMS_PER_THREAD],            ///< [in] Corresponding scatter ranks
-        ValidFlag   is_valid[ITEMS_PER_THREAD])         ///< [in] Corresponding flag denoting item validity
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = ranks[ITEM];
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            if (is_valid[ITEM])
-                temp_storage.buff[item_offset] = input_items[ITEM];
-        }
-
-        CTA_SYNC();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            output_items[ITEM] = temp_storage.buff[item_offset];
-        }
-    }
-
-
-    //@}  end member group
-
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-    __device__ __forceinline__ void StripedToBlocked(
-        InputT      items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-    {
-        StripedToBlocked(items, items);
-    }
-
-    __device__ __forceinline__ void BlockedToStriped(
-        InputT      items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-    {
-        BlockedToStriped(items, items);
-    }
-
-    __device__ __forceinline__ void WarpStripedToBlocked(
-        InputT      items[ITEMS_PER_THREAD])    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-    {
-        WarpStripedToBlocked(items, items);
-    }
-
-    __device__ __forceinline__ void BlockedToWarpStriped(
-        InputT      items[ITEMS_PER_THREAD])    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-    {
-        BlockedToWarpStriped(items, items);
-    }
-
-    template <typename OffsetT>
-    __device__ __forceinline__ void ScatterToBlocked(
-        InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
-    {
-        ScatterToBlocked(items, items, ranks);
-    }
-
-    template <typename OffsetT>
-    __device__ __forceinline__ void ScatterToStriped(
-        InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
-    {
-        ScatterToStriped(items, items, ranks);
-    }
-
-    template <typename OffsetT>
-    __device__ __forceinline__ void ScatterToStripedGuarded(
-        InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
-    {
-        ScatterToStripedGuarded(items, items, ranks);
-    }
-
-    template <typename OffsetT, typename ValidFlag>
-    __device__ __forceinline__ void ScatterToStripedFlagged(
-        InputT      items[ITEMS_PER_THREAD],        ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        OffsetT     ranks[ITEMS_PER_THREAD],        ///< [in] Corresponding scatter ranks
-        ValidFlag   is_valid[ITEMS_PER_THREAD])     ///< [in] Corresponding flag denoting item validity
-    {
-        ScatterToStriped(items, items, ranks, is_valid);
-    }
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-};
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-template <
-    typename    T,
-    int         ITEMS_PER_THREAD,
-    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
-    int         PTX_ARCH                = CUB_PTX_ARCH>
-class WarpExchange
-{
-private:
-
-    /******************************************************************************
-     * Constants
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        // Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        WARP_ITEMS                  = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1,
-
-        LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(PTX_ARCH),
-        SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,
-
-        // Insert padding if the number of items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
-        INSERT_PADDING              = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
-        PADDING_ITEMS               = (INSERT_PADDING) ? (WARP_ITEMS >> LOG_SMEM_BANKS) : 0,
-    };
-
-    /******************************************************************************
-     * Type definitions
-     ******************************************************************************/
-
-    /// Shared memory storage layout type
-    struct _TempStorage
-    {
-        T buff[WARP_ITEMS + PADDING_ITEMS];
-    };
-
-public:
-
-    /// \smemstorage{WarpExchange}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-private:
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    _TempStorage    &temp_storage;
-    int             lane_id;
-
-public:
-
-    /******************************************************************************
-     * Construction
-     ******************************************************************************/
-
-    /// Constructor
-    __device__ __forceinline__ WarpExchange(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        lane_id(IS_ARCH_WARP ?
-            LaneId() :
-            LaneId() % LOGICAL_WARP_THREADS)
-    {}
-
-
-    /******************************************************************************
-     * Interface
-     ******************************************************************************/
-
-    /**
-     * \brief Exchanges valid data items annotated by rank into <em>striped</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \tparam OffsetT                              <b>[inferred]</b> Signed integer type for local offsets
-     */
-    template <typename OffsetT>
-    __device__ __forceinline__ void ScatterToStriped(
-        T               items[ITEMS_PER_THREAD],        ///< [in-out] Items to exchange
-        OffsetT         ranks[ITEMS_PER_THREAD])        ///< [in] Corresponding scatter ranks
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            if (INSERT_PADDING) ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]);
-            temp_storage.buff[ranks[ITEM]] = items[ITEM];
-        }
-
-        WARP_SYNC(0xffffffff);
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id;
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            items[ITEM] = temp_storage.buff[item_offset];
-        }
-    }
-
-};
-
-
-
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/block_histogram.cuh b/thrust/system/cuda/detail/cub/block/block_histogram.cuh
deleted file mode 100644
index f97f89ea6..000000000
--- a/thrust/system/cuda/detail/cub/block/block_histogram.cuh
+++ /dev/null
@@ -1,415 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockHistogram class provides [<em>collective</em>](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "specializations/block_histogram_sort.cuh"
-#include "specializations/block_histogram_atomic.cuh"
-#include "../util_ptx.cuh"
-#include "../util_arch.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Algorithmic variants
- ******************************************************************************/
-
-/**
- * \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms.
- */
-enum BlockHistogramAlgorithm
-{
-
-    /**
-     * \par Overview
-     * Sorting followed by differentiation.  Execution is comprised of two phases:
-     * -# Sort the data using efficient radix sort
-     * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts.
-     *
-     * \par Performance Considerations
-     * Delivers consistent throughput regardless of sample bin distribution.
-     */
-    BLOCK_HISTO_SORT,
-
-
-    /**
-     * \par Overview
-     * Use atomic addition to update byte counts directly
-     *
-     * \par Performance Considerations
-     * Performance is strongly tied to the hardware implementation of atomic
-     * addition, and may be significantly degraded for non uniformly-random
-     * input distributions where many concurrent updates are likely to be
-     * made to the same bin counter.
-     */
-    BLOCK_HISTO_ATOMIC,
-};
-
-
-
-/******************************************************************************
- * Block histogram
- ******************************************************************************/
-
-
-/**
- * \brief The BlockHistogram class provides [<em>collective</em>](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. ![](histogram_logo.png)
- * \ingroup BlockModule
- *
- * \tparam T                    The sample type being histogrammed (must be castable to an integer bin identifier)
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam ITEMS_PER_THREAD     The number of items per thread
- * \tparam BINS                 The number bins within the histogram
- * \tparam ALGORITHM            <b>[optional]</b> cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT)
- * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
- *   counts the number of observations that fall into each of the disjoint categories (known as <em>bins</em>).
- * - BlockHistogram can be optionally specialized to use different algorithms:
- *   -# <b>cub::BLOCK_HISTO_SORT</b>.  Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm)
- *   -# <b>cub::BLOCK_HISTO_ATOMIC</b>.  Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHistogramAlgorithm)
- *
- * \par Performance Considerations
- * - \granularity
- *
- * \par A Simple Example
- * \blockcollective{BlockHistogram}
- * \par
- * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
- * are partitioned across 128 threads where each thread owns 4 samples.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
- *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
- *
- *     // Allocate shared memory for BlockHistogram
- *     __shared__ typename BlockHistogram::TempStorage temp_storage;
- *
- *     // Allocate shared memory for block-wide histogram bin counts
- *     __shared__ unsigned int smem_histogram[256];
- *
- *     // Obtain input samples per thread
- *     unsigned char data[4];
- *     ...
- *
- *     // Compute the block-wide histogram
- *     BlockHistogram(temp_storage).Histogram(data, smem_histogram);
- *
- * \endcode
- *
- * \par Performance and Usage Considerations
- * - The histogram output can be constructed in shared or device-accessible memory
- * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives
- *
- */
-template <
-    typename                T,
-    int                     BLOCK_DIM_X,
-    int                     ITEMS_PER_THREAD,
-    int                     BINS,
-    BlockHistogramAlgorithm ALGORITHM           = BLOCK_HISTO_SORT,
-    int                     BLOCK_DIM_Y         = 1,
-    int                     BLOCK_DIM_Z         = 1,
-    int                     PTX_ARCH            = CUB_PTX_ARCH>
-class BlockHistogram
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    /**
-     * Ensure the template parameterization meets the requirements of the
-     * targeted device architecture.  BLOCK_HISTO_ATOMIC can only be used
-     * on version SM120 or later.  Otherwise BLOCK_HISTO_SORT is used
-     * regardless.
-     */
-    static const BlockHistogramAlgorithm SAFE_ALGORITHM =
-        ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (PTX_ARCH < 120)) ?
-            BLOCK_HISTO_SORT :
-            ALGORITHM;
-
-    /// Internal specialization.
-    typedef typename If<(SAFE_ALGORITHM == BLOCK_HISTO_SORT),
-        BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>,
-        BlockHistogramAtomic<BINS> >::Type InternalBlockHistogram;
-
-    /// Shared memory storage layout type for BlockHistogram
-    typedef typename InternalBlockHistogram::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    unsigned int linear_tid;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-public:
-
-    /// \smemstorage{BlockHistogram}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockHistogram()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockHistogram(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Histogram operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Initialize the shared histogram counters to zero.
-     *
-     * \par Snippet
-     * The code snippet below illustrates a the initialization and update of a
-     * histogram of 512 integer samples that are partitioned across 128 threads
-     * where each thread owns 4 samples.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
-     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
-     *
-     *     // Allocate shared memory for BlockHistogram
-     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
-     *
-     *     // Allocate shared memory for block-wide histogram bin counts
-     *     __shared__ unsigned int smem_histogram[256];
-     *
-     *     // Obtain input samples per thread
-     *     unsigned char thread_samples[4];
-     *     ...
-     *
-     *     // Initialize the block-wide histogram
-     *     BlockHistogram(temp_storage).InitHistogram(smem_histogram);
-     *
-     *     // Update the block-wide histogram
-     *     BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
-     *
-     * \endcode
-     *
-     * \tparam CounterT              <b>[inferred]</b> Histogram counter type
-     */
-    template <typename CounterT     >
-    __device__ __forceinline__ void InitHistogram(CounterT      histogram[BINS])
-    {
-        // Initialize histogram bin counts to zeros
-        int histo_offset = 0;
-
-        #pragma unroll
-        for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
-        {
-            histogram[histo_offset + linear_tid] = 0;
-        }
-        // Finish up with guarded initialization if necessary
-        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
-        {
-            histogram[histo_offset + linear_tid] = 0;
-        }
-    }
-
-
-    /**
-     * \brief Constructs a block-wide histogram in shared/device-accessible memory.  Each thread contributes an array of input elements.
-     *
-     * \par
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
-     * are partitioned across 128 threads where each thread owns 4 samples.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
-     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
-     *
-     *     // Allocate shared memory for BlockHistogram
-     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
-     *
-     *     // Allocate shared memory for block-wide histogram bin counts
-     *     __shared__ unsigned int smem_histogram[256];
-     *
-     *     // Obtain input samples per thread
-     *     unsigned char thread_samples[4];
-     *     ...
-     *
-     *     // Compute the block-wide histogram
-     *     BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);
-     *
-     * \endcode
-     *
-     * \tparam CounterT              <b>[inferred]</b> Histogram counter type
-     */
-    template <
-        typename            CounterT     >
-    __device__ __forceinline__ void Histogram(
-        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
-        CounterT             histogram[BINS])                ///< [out] Reference to shared/device-accessible memory histogram
-    {
-        // Initialize histogram bin counts to zeros
-        InitHistogram(histogram);
-
-        CTA_SYNC();
-
-        // Composite the histogram
-        InternalBlockHistogram(temp_storage).Composite(items, histogram);
-    }
-
-
-
-    /**
-     * \brief Updates an existing block-wide histogram in shared/device-accessible memory.  Each thread composites an array of input elements.
-     *
-     * \par
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a the initialization and update of a
-     * histogram of 512 integer samples that are partitioned across 128 threads
-     * where each thread owns 4 samples.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
-     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
-     *
-     *     // Allocate shared memory for BlockHistogram
-     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
-     *
-     *     // Allocate shared memory for block-wide histogram bin counts
-     *     __shared__ unsigned int smem_histogram[256];
-     *
-     *     // Obtain input samples per thread
-     *     unsigned char thread_samples[4];
-     *     ...
-     *
-     *     // Initialize the block-wide histogram
-     *     BlockHistogram(temp_storage).InitHistogram(smem_histogram);
-     *
-     *     // Update the block-wide histogram
-     *     BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
-     *
-     * \endcode
-     *
-     * \tparam CounterT              <b>[inferred]</b> Histogram counter type
-     */
-    template <
-        typename            CounterT     >
-    __device__ __forceinline__ void Composite(
-        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
-        CounterT             histogram[BINS])                 ///< [out] Reference to shared/device-accessible memory histogram
-    {
-        InternalBlockHistogram(temp_storage).Composite(items, histogram);
-    }
-
-};
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/block_load.cuh b/thrust/system/cuda/detail/cub/block/block_load.cuh
deleted file mode 100644
index cca853346..000000000
--- a/thrust/system/cuda/detail/cub/block/block_load.cuh
+++ /dev/null
@@ -1,1230 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2016, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Operations for reading linear tiles of data into the CUDA thread block.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "block_exchange.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_ptx.cuh"
-#include "../util_macro.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIo
- * @{
- */
-
-
-/******************************************************************//**
- * \name Blocked arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-
-/**
- * \brief Load a linear segment of items into a blocked arrangement across the thread block.
- *
- * \blocked
- *
- * \tparam T                    <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    typename        InputT,
-    int             ITEMS_PER_THREAD,
-    typename        InputIteratorT>
-__device__ __forceinline__ void LoadDirectBlocked(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
-{
-    // Load directly in thread-blocked order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM];
-    }
-}
-
-
-/**
- * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.
- *
- * \blocked
- *
- * \tparam T                    <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    typename        InputT,
-    int             ITEMS_PER_THREAD,
-    typename        InputIteratorT>
-__device__ __forceinline__ void LoadDirectBlocked(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int             valid_items)                ///< [in] Number of valid items to load
-{
-
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items)
-        {
-            items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM];
-        }
-    }
-}
-
-
-/**
- * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements..
- *
- * \blocked
- *
- * \tparam T                    <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    typename        InputT,
-    typename        DefaultT,
-    int             ITEMS_PER_THREAD,
-    typename        InputIteratorT>
-__device__ __forceinline__ void LoadDirectBlocked(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int             valid_items,                ///< [in] Number of valid items to load
-    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
-{
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        items[ITEM] = oob_default;
-
-    LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
-}
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-/**
- * Internal implementation for load vectorization
- */
-template <
-    CacheLoadModifier   MODIFIER,
-    typename            T,
-    int                 ITEMS_PER_THREAD>
-__device__ __forceinline__ void InternalLoadDirectBlockedVectorized(
-    int    linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    T      *block_ptr,                 ///< [in] Input pointer for loading from
-    T      (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
-{
-    // Biggest memory access word that T is a whole multiple of
-    typedef typename UnitWord<T>::DeviceWord DeviceWord;
-
-    enum
-    {
-        TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord),
-
-        VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ?
-            4 :
-            (TOTAL_WORDS % 2 == 0) ?
-                2 :
-                1,
-
-        VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE,
-    };
-
-    // Vector type
-    typedef typename CubVector<DeviceWord, VECTOR_SIZE>::Type Vector;
-
-    // Vector items
-    Vector vec_items[VECTORS_PER_THREAD];
-
-    // Aliased input ptr
-    Vector* vec_ptr = reinterpret_cast<Vector*>(block_ptr) + (linear_tid * VECTORS_PER_THREAD);
-
-    // Load directly in thread-blocked order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++)
-    {
-        vec_items[ITEM] = ThreadLoad<MODIFIER>(vec_ptr + ITEM);
-    }
-
-    // Copy
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        items[ITEM] = *(reinterpret_cast<T*>(vec_items) + ITEM);
-    }
-}
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/**
- * \brief Load a linear segment of items into a blocked arrangement across the thread block.
- *
- * \blocked
- *
- * The input offset (\p block_ptr + \p block_offset) must be quad-item aligned
- *
- * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
- *   - \p ITEMS_PER_THREAD is odd
- *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
- *
- * \tparam T                    <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- */
-template <
-    typename        T,
-    int             ITEMS_PER_THREAD>
-__device__ __forceinline__ void LoadDirectBlockedVectorized(
-    int linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    T   *block_ptr,                 ///< [in] Input pointer for loading from
-    T   (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
-{
-    InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
-}
-
-
-//@}  end member group
-/******************************************************************//**
- * \name Striped arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-
-/**
- * \brief Load a linear segment of items into a striped arrangement across the thread block.
- *
- * \striped
- *
- * \tparam BLOCK_THREADS        The thread block size in threads
- * \tparam T                    <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    int             BLOCK_THREADS,
-    typename        InputT,
-    int             ITEMS_PER_THREAD,
-    typename        InputIteratorT>
-__device__ __forceinline__ void LoadDirectStriped(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
-{
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        items[ITEM] = block_itr[linear_tid + ITEM * BLOCK_THREADS];
-    }
-}
-
-
-/**
- * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range
- *
- * \striped
- *
- * \tparam BLOCK_THREADS        The thread block size in threads
- * \tparam T                    <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    int             BLOCK_THREADS,
-    typename        InputT,
-    int             ITEMS_PER_THREAD,
-    typename        InputIteratorT>
-__device__ __forceinline__ void LoadDirectStriped(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int             valid_items)                ///< [in] Number of valid items to load
-{
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items)
-        {
-            items[ITEM] = block_itr[linear_tid + ITEM * BLOCK_THREADS];
-        }
-    }
-}
-
-
-/**
- * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
- *
- * \striped
- *
- * \tparam BLOCK_THREADS        The thread block size in threads
- * \tparam T                    <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    int             BLOCK_THREADS,
-    typename        InputT,
-    typename        DefaultT,
-    int             ITEMS_PER_THREAD,
-    typename        InputIteratorT>
-__device__ __forceinline__ void LoadDirectStriped(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int             valid_items,                ///< [in] Number of valid items to load
-    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
-{
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        items[ITEM] = oob_default;
-
-    LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
-}
-
-
-
-//@}  end member group
-/******************************************************************//**
- * \name Warp-striped arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-
-/**
- * \brief Load a linear segment of items into a warp-striped arrangement across the thread block.
- *
- * \warpstriped
- *
- * \par Usage Considerations
- * The number of threads in the thread block must be a multiple of the architecture's warp size.
- *
- * \tparam T                    <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    typename        InputT,
-    int             ITEMS_PER_THREAD,
-    typename        InputIteratorT>
-__device__ __forceinline__ void LoadDirectWarpStriped(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
-{
-    int tid                = linear_tid & (CUB_PTX_WARP_THREADS - 1);
-    int wid                = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
-    int warp_offset        = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
-
-    // Load directly in warp-striped order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        items[ITEM] = block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)];
-    }
-}
-
-
-/**
- * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range
- *
- * \warpstriped
- *
- * \par Usage Considerations
- * The number of threads in the thread block must be a multiple of the architecture's warp size.
- *
- * \tparam T                    <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIteratorT       <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    typename        InputT,
-    int             ITEMS_PER_THREAD,
-    typename        InputIteratorT>
-__device__ __forceinline__ void LoadDirectWarpStriped(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int             valid_items)                ///< [in] Number of valid items to load
-{
-    int tid                = linear_tid & (CUB_PTX_WARP_THREADS - 1);
-    int wid                = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
-    int warp_offset        = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
-
-    // Load directly in warp-striped order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
-        {
-            items[ITEM] = block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)];
-        }
-    }
-}
-
-
-/**
- * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
- *
- * \warpstriped
- *
- * \par Usage Considerations
- * The number of threads in the thread block must be a multiple of the architecture's warp size.
- *
- * \tparam T                    <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIteratorT        <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    typename        InputT,
-    typename        DefaultT,
-    int             ITEMS_PER_THREAD,
-    typename        InputIteratorT>
-__device__ __forceinline__ void LoadDirectWarpStriped(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int             valid_items,                ///< [in] Number of valid items to load
-    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
-{
-    // Load directly in warp-striped order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        items[ITEM] = oob_default;
-
-    LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
-}
-
-
-
-//@}  end member group
-
-/** @} */       // end group UtilIo
-
-
-
-//-----------------------------------------------------------------------------
-// Generic BlockLoad abstraction
-//-----------------------------------------------------------------------------
-
-/**
- * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.
- */
-
-/**
- * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.
- */
-enum BlockLoadAlgorithm
-{
-    /**
-     * \par Overview
-     *
-     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read
-     * directly from memory.
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) decreases as the
-     *   access stride between threads increases (i.e., the number items per thread).
-     */
-    BLOCK_LOAD_DIRECT,
-
-    /**
-     * \par Overview
-     *
-     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read
-     * from memory using CUDA's built-in vectorized loads as a coalescing optimization.
-     * For example, <tt>ld.global.v4.s32</tt> instructions will be generated
-     * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high until the the
-     *   access stride between threads (i.e., the number items per thread) exceeds the
-     *   maximum vector load width (typically 4 items or 64B, whichever is lower).
-     * - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
-     *   - \p ITEMS_PER_THREAD is odd
-     *   - The \p InputIteratorTis not a simple pointer type
-     *   - The block input offset is not quadword-aligned
-     *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
-     */
-    BLOCK_LOAD_VECTORIZE,
-
-    /**
-     * \par Overview
-     *
-     * A [<em>striped arrangement</em>](index.html#sec5sec3) of data is read
-     * efficiently from memory and then locally transposed into a
-     * [<em>blocked arrangement</em>](index.html#sec5sec3).
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high regardless
-     *   of items loaded per thread.
-     * - The local reordering incurs slightly longer latencies and throughput than the
-     *   direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
-     */
-    BLOCK_LOAD_TRANSPOSE,
-
-
-    /**
-     * \par Overview
-     *
-     * A [<em>warp-striped arrangement</em>](index.html#sec5sec3) of data is
-     * read efficiently from memory and then locally transposed into a
-     * [<em>blocked arrangement</em>](index.html#sec5sec3).
-     *
-     * \par Usage Considerations
-     * - BLOCK_THREADS must be a multiple of WARP_THREADS
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high regardless
-     *   of items loaded per thread.
-     * - The local reordering incurs slightly larger latencies than the
-     *   direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
-     * - Provisions more shared storage, but incurs smaller latencies than the
-     *   BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative.
-     */
-    BLOCK_LOAD_WARP_TRANSPOSE,
-
-
-    /**
-     * \par Overview
-     *
-     * Like \p BLOCK_LOAD_WARP_TRANSPOSE, a [<em>warp-striped arrangement</em>](index.html#sec5sec3)
-     * of data is read directly from memory and then is locally transposed into a
-     * [<em>blocked arrangement</em>](index.html#sec5sec3). To reduce the shared memory
-     * requirement, only one warp's worth of shared memory is provisioned and is
-     * subsequently time-sliced among warps.
-     *
-     * \par Usage Considerations
-     * - BLOCK_THREADS must be a multiple of WARP_THREADS
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high regardless
-     *   of items loaded per thread.
-     * - Provisions less shared memory temporary storage, but incurs larger
-     *   latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative.
-     */
-    BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,
-};
-
-
-/**
- * \brief The BlockLoad class provides [<em>collective</em>](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [<em>blocked arrangement</em>](index.html#sec5sec3) across a CUDA thread block.  ![](block_load_logo.png)
- * \ingroup BlockModule
- * \ingroup UtilIo
- *
- * \tparam InputT               The data type to read into (which must be convertible from the input iterator's value type).
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam ITEMS_PER_THREAD     The number of consecutive items partitioned onto each thread.
- * \tparam ALGORITHM            <b>[optional]</b> cub::BlockLoadAlgorithm tuning policy.  default: cub::BLOCK_LOAD_DIRECT.
- * \tparam WARP_TIME_SLICING    <b>[optional]</b> Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false)
- * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - The BlockLoad class provides a single data movement abstraction that can be specialized
- *   to implement different cub::BlockLoadAlgorithm strategies.  This facilitates different
- *   performance policies for different architectures, data types, granularity sizes, etc.
- * - BlockLoad can be optionally specialized by different data movement strategies:
- *   -# <b>cub::BLOCK_LOAD_DIRECT</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
- *      of data is read directly from memory.  [More...](\ref cub::BlockLoadAlgorithm)
- *   -# <b>cub::BLOCK_LOAD_VECTORIZE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
- *      of data is read directly from memory using CUDA's built-in vectorized loads as a
- *      coalescing optimization.    [More...](\ref cub::BlockLoadAlgorithm)
- *   -# <b>cub::BLOCK_LOAD_TRANSPOSE</b>.  A [<em>striped arrangement</em>](index.html#sec5sec3)
- *      of data is read directly from memory and is then locally transposed into a
- *      [<em>blocked arrangement</em>](index.html#sec5sec3).  [More...](\ref cub::BlockLoadAlgorithm)
- *   -# <b>cub::BLOCK_LOAD_WARP_TRANSPOSE</b>.  A [<em>warp-striped arrangement</em>](index.html#sec5sec3)
- *      of data is read directly from memory and is then locally transposed into a
- *      [<em>blocked arrangement</em>](index.html#sec5sec3).  [More...](\ref cub::BlockLoadAlgorithm)
- *   -# <b>cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,</b>.  A [<em>warp-striped arrangement</em>](index.html#sec5sec3)
- *      of data is read directly from memory and is then locally transposed into a
- *      [<em>blocked arrangement</em>](index.html#sec5sec3) one warp at a time.  [More...](\ref cub::BlockLoadAlgorithm)
- * - \rowmajor
- *
- * \par A Simple Example
- * \blockcollective{BlockLoad}
- * \par
- * The code snippet below illustrates the loading of a linear
- * segment of 512 integers into a "blocked" arrangement across 128 threads where each
- * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
- * meaning memory references are efficiently coalesced using a warp-striped access
- * pattern (after which items are locally reordered among threads).
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
- *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
- *
- *     // Allocate shared memory for BlockLoad
- *     __shared__ typename BlockLoad::TempStorage temp_storage;
- *
- *     // Load a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     BlockLoad(temp_storage).Load(d_data, thread_data);
- *
- * \endcode
- * \par
- * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>.
- * The set of \p thread_data across the block of threads in those threads will be
- * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
- *
- */
-template <
-    typename            InputT,
-    int                 BLOCK_DIM_X,
-    int                 ITEMS_PER_THREAD,
-    BlockLoadAlgorithm  ALGORITHM           = BLOCK_LOAD_DIRECT,
-    int                 BLOCK_DIM_Y         = 1,
-    int                 BLOCK_DIM_Z         = 1,
-    int                 PTX_ARCH            = CUB_PTX_ARCH>
-class BlockLoad
-{
-private:
-
-    /******************************************************************************
-     * Constants and typed definitions
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-
-    /******************************************************************************
-     * Algorithmic variants
-     ******************************************************************************/
-
-    /// Load helper
-    template <BlockLoadAlgorithm _POLICY, int DUMMY>
-    struct LoadInternal;
-
-
-    /**
-     * BLOCK_LOAD_DIRECT specialization of load helper
-     */
-    template <int DUMMY>
-    struct LoadInternal<BLOCK_LOAD_DIRECT, DUMMY>
-    {
-        /// Shared memory storage layout type
-        typedef NullType TempStorage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ LoadInternal(
-            TempStorage &/*temp_storage*/,
-            int linear_tid)
-        :
-            linear_tid(linear_tid)
-        {}
-
-        /// Load a linear segment of items from memory
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
-        {
-            LoadDirectBlocked(linear_tid, block_itr, items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items)                    ///< [in] Number of valid items to load
-        {
-            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
-        template <typename InputIteratorT, typename DefaultT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items,                    ///< [in] Number of valid items to load
-            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
-        {
-            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
-        }
-
-    };
-
-
-    /**
-     * BLOCK_LOAD_VECTORIZE specialization of load helper
-     */
-    template <int DUMMY>
-    struct LoadInternal<BLOCK_LOAD_VECTORIZE, DUMMY>
-    {
-        /// Shared memory storage layout type
-        typedef NullType TempStorage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ LoadInternal(
-            TempStorage &/*temp_storage*/,
-            int linear_tid)
-        :
-            linear_tid(linear_tid)
-        {}
-
-        /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            InputT               *block_ptr,                     ///< [in] The thread block's base input iterator for loading from
-            InputT               (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
-        {
-            InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
-        }
-
-        /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            const InputT         *block_ptr,                     ///< [in] The thread block's base input iterator for loading from
-            InputT               (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
-        {
-            InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
-        }
-
-        /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
-        template <
-            CacheLoadModifier   MODIFIER,
-            typename            ValueType,
-            typename            OffsetT>
-        __device__ __forceinline__ void Load(
-            CacheModifiedInputIterator<MODIFIER, ValueType, OffsetT>    block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT                                                     (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
-        {
-            InternalLoadDirectBlockedVectorized<MODIFIER>(linear_tid, block_itr.ptr, items);
-        }
-
-        /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization)
-        template <typename _InputIteratorT>
-        __device__ __forceinline__ void Load(
-            _InputIteratorT   block_itr,                    ///< [in] The thread block's base input iterator for loading from
-            InputT           (&items)[ITEMS_PER_THREAD])   ///< [out] Data to load
-        {
-            LoadDirectBlocked(linear_tid, block_itr, items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range (skips vectorization)
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items)                    ///< [in] Number of valid items to load
-        {
-            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization)
-        template <typename InputIteratorT, typename DefaultT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items,                    ///< [in] Number of valid items to load
-            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
-        {
-            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
-        }
-
-    };
-
-
-    /**
-     * BLOCK_LOAD_TRANSPOSE specialization of load helper
-     */
-    template <int DUMMY>
-    struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
-    {
-        // BlockExchange utility type for keys
-        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
-
-        /// Shared memory storage layout type
-        struct _TempStorage : BlockExchange::TempStorage
-        {};
-
-        /// Alias wrapper allowing storage to be unioned
-        struct TempStorage : Uninitialized<_TempStorage> {};
-
-        /// Thread reference to shared storage
-        _TempStorage &temp_storage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ LoadInternal(
-            TempStorage &temp_storage,
-            int linear_tid)
-        :
-            temp_storage(temp_storage.Alias()),
-            linear_tid(linear_tid)
-        {}
-
-        /// Load a linear segment of items from memory
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load{
-        {
-            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
-            BlockExchange(temp_storage).StripedToBlocked(items, items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items)                    ///< [in] Number of valid items to load
-        {
-            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
-            BlockExchange(temp_storage).StripedToBlocked(items, items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
-        template <typename InputIteratorT, typename DefaultT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items,                    ///< [in] Number of valid items to load
-            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
-        {
-            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items, oob_default);
-            BlockExchange(temp_storage).StripedToBlocked(items, items);
-        }
-
-    };
-
-
-    /**
-     * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper
-     */
-    template <int DUMMY>
-    struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE, DUMMY>
-    {
-        enum
-        {
-            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
-        };
-
-        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
-        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
-
-        // BlockExchange utility type for keys
-        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
-
-        /// Shared memory storage layout type
-        struct _TempStorage : BlockExchange::TempStorage
-        {};
-
-        /// Alias wrapper allowing storage to be unioned
-        struct TempStorage : Uninitialized<_TempStorage> {};
-
-        /// Thread reference to shared storage
-        _TempStorage &temp_storage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ LoadInternal(
-            TempStorage &temp_storage,
-            int linear_tid)
-        :
-            temp_storage(temp_storage.Alias()),
-            linear_tid(linear_tid)
-        {}
-
-        /// Load a linear segment of items from memory
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load{
-        {
-            LoadDirectWarpStriped(linear_tid, block_itr, items);
-            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items)                    ///< [in] Number of valid items to load
-        {
-            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
-            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
-        }
-
-
-        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
-        template <typename InputIteratorT, typename DefaultT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items,                    ///< [in] Number of valid items to load
-            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
-        {
-            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);
-            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
-        }
-    };
-
-
-    /**
-     * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper
-     */
-    template <int DUMMY>
-    struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY>
-    {
-        enum
-        {
-            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
-        };
-
-        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
-        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
-
-        // BlockExchange utility type for keys
-        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
-
-        /// Shared memory storage layout type
-        struct _TempStorage : BlockExchange::TempStorage
-        {};
-
-        /// Alias wrapper allowing storage to be unioned
-        struct TempStorage : Uninitialized<_TempStorage> {};
-
-        /// Thread reference to shared storage
-        _TempStorage &temp_storage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ LoadInternal(
-            TempStorage &temp_storage,
-            int linear_tid)
-        :
-            temp_storage(temp_storage.Alias()),
-            linear_tid(linear_tid)
-        {}
-
-        /// Load a linear segment of items from memory
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load{
-        {
-            LoadDirectWarpStriped(linear_tid, block_itr, items);
-            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range
-        template <typename InputIteratorT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items)                    ///< [in] Number of valid items to load
-        {
-            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
-            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
-        }
-
-
-        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
-        template <typename InputIteratorT, typename DefaultT>
-        __device__ __forceinline__ void Load(
-            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items,                    ///< [in] Number of valid items to load
-            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
-        {
-            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);
-            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
-        }
-    };
-
-
-    /******************************************************************************
-     * Type definitions
-     ******************************************************************************/
-
-    /// Internal load implementation to use
-    typedef LoadInternal<ALGORITHM, 0> InternalLoad;
-
-
-    /// Shared memory storage layout type
-    typedef typename InternalLoad::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Thread reference to shared storage
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    int linear_tid;
-
-public:
-
-    /// \smemstorage{BlockLoad}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockLoad()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockLoad(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Data movement
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Load a linear segment of items from memory.
-     *
-     * \par
-     * - \blocked
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the loading of a linear
-     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
-     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
-     * meaning memory references are efficiently coalesced using a warp-striped access
-     * pattern (after which items are locally reordered among threads).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, ...)
-     * {
-     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
-     *
-     *     // Allocate shared memory for BlockLoad
-     *     __shared__ typename BlockLoad::TempStorage temp_storage;
-     *
-     *     // Load a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     BlockLoad(temp_storage).Load(d_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>.
-     * The set of \p thread_data across the block of threads in those threads will be
-     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
-     *
-     */
-    template <typename InputIteratorT>
-    __device__ __forceinline__ void Load(
-        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-        InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
-    {
-        InternalLoad(temp_storage, linear_tid).Load(block_itr, items);
-    }
-
-
-    /**
-     * \brief Load a linear segment of items from memory, guarded by range.
-     *
-     * \par
-     * - \blocked
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the guarded loading of a linear
-     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
-     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
-     * meaning memory references are efficiently coalesced using a warp-striped access
-     * pattern (after which items are locally reordered among threads).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
-     * {
-     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
-     *
-     *     // Allocate shared memory for BlockLoad
-     *     __shared__ typename BlockLoad::TempStorage temp_storage;
-     *
-     *     // Load a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     BlockLoad(temp_storage).Load(d_data, thread_data, valid_items);
-     *
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, 6...</tt> and \p valid_items is \p 5.
-     * The set of \p thread_data across the block of threads in those threads will be
-     * <tt>{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }</tt>, with only the first two threads
-     * being unmasked to load portions of valid data (and other items remaining unassigned).
-     *
-     */
-    template <typename InputIteratorT>
-    __device__ __forceinline__ void Load(
-        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-        InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-        int             valid_items)                ///< [in] Number of valid items to load
-    {
-        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items);
-    }
-
-
-    /**
-     * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
-     *
-     * \par
-     * - \blocked
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the guarded loading of a linear
-     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
-     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
-     * meaning memory references are efficiently coalesced using a warp-striped access
-     * pattern (after which items are locally reordered among threads).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
-     * {
-     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
-     *
-     *     // Allocate shared memory for BlockLoad
-     *     __shared__ typename BlockLoad::TempStorage temp_storage;
-     *
-     *     // Load a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1);
-     *
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, 6...</tt>,
-     * \p valid_items is \p 5, and the out-of-bounds default is \p -1.
-     * The set of \p thread_data across the block of threads in those threads will be
-     * <tt>{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }</tt>, with only the first two threads
-     * being unmasked to load portions of valid data (and other items are assigned \p -1)
-     *
-     */
-    template <typename InputIteratorT, typename DefaultT>
-    __device__ __forceinline__ void Load(
-        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-        InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-        int             valid_items,                ///< [in] Number of valid items to load
-        DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
-    {
-        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default);
-    }
-
-
-    //@}  end member group
-
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh b/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh
deleted file mode 100644
index cfd0652ec..000000000
--- a/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh
+++ /dev/null
@@ -1,696 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block
- */
-
-#pragma once
-
-#include <stdint.h>
-
-#include "../thread/thread_reduce.cuh"
-#include "../thread/thread_scan.cuh"
-#include "../block/block_scan.cuh"
-#include "../util_ptx.cuh"
-#include "../util_arch.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
- * \ingroup BlockModule
- *
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam RADIX_BITS           The number of radix bits per digit place
- * \tparam IS_DESCENDING           Whether or not the sorted-order is high-to-low
- * \tparam MEMOIZE_OUTER_SCAN   <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise).  See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
- * \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
- * \tparam SMEM_CONFIG          <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
- * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * Blah...
- * - Keys must be in a form suitable for radix ranking (i.e., unsigned bits).
- * - \blocked
- *
- * \par Performance Considerations
- * - \granularity
- *
- * \par Examples
- * \par
- * - <b>Example 1:</b> Simple radix rank of 32-bit integer keys
- *      \code
- *      #include <cub/cub.cuh>
- *
- *      template <int BLOCK_THREADS>
- *      __global__ void ExampleKernel(...)
- *      {
- *
- *      \endcode
- */
-template <
-    int                     BLOCK_DIM_X,
-    int                     RADIX_BITS,
-    bool                    IS_DESCENDING,
-    bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ? true : false,
-    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
-    cudaSharedMemConfig     SMEM_CONFIG             = cudaSharedMemBankSizeFourByte,
-    int                     BLOCK_DIM_Y             = 1,
-    int                     BLOCK_DIM_Z             = 1,
-    int                     PTX_ARCH                = CUB_PTX_ARCH>
-class BlockRadixRank
-{
-private:
-
-    /******************************************************************************
-     * Type definitions and constants
-     ******************************************************************************/
-
-    // Integer type for digit counters (to be packed into words of type PackedCounters)
-    typedef unsigned short DigitCounter;
-
-    // Integer type for packing DigitCounters into columns of shared memory banks
-    typedef typename If<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte),
-        unsigned long long,
-        unsigned int>::Type PackedCounter;
-
-    enum
-    {
-        // The thread block size in threads
-        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        RADIX_DIGITS                = 1 << RADIX_BITS,
-
-        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
-        WARP_THREADS                = 1 << LOG_WARP_THREADS,
-        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-
-        BYTES_PER_COUNTER           = sizeof(DigitCounter),
-        LOG_BYTES_PER_COUNTER       = Log2<BYTES_PER_COUNTER>::VALUE,
-
-        PACKING_RATIO               = sizeof(PackedCounter) / sizeof(DigitCounter),
-        LOG_PACKING_RATIO           = Log2<PACKING_RATIO>::VALUE,
-
-        LOG_COUNTER_LANES           = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0),                // Always at least one lane
-        COUNTER_LANES               = 1 << LOG_COUNTER_LANES,
-
-        // The number of packed counters per thread (plus one for padding)
-        PADDED_COUNTER_LANES        = COUNTER_LANES + 1,
-        RAKING_SEGMENT              = PADDED_COUNTER_LANES,
-    };
-
-public:
-
-    enum
-    {
-        /// Number of bin-starting offsets tracked per thread
-        BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
-    };
-
-private:
-
-
-    /// BlockScan type
-    typedef BlockScan<
-            PackedCounter,
-            BLOCK_DIM_X,
-            INNER_SCAN_ALGORITHM,
-            BLOCK_DIM_Y,
-            BLOCK_DIM_Z,
-            PTX_ARCH>
-        BlockScan;
-
-
-    /// Shared memory storage layout type for BlockRadixRank
-    struct __align__(16) _TempStorage
-    {
-        union Aliasable
-        {
-            DigitCounter            digit_counters[PADDED_COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];
-            PackedCounter           raking_grid[BLOCK_THREADS][RAKING_SEGMENT];
-
-        } aliasable;
-
-        // Storage for scanning local ranks
-        typename BlockScan::TempStorage block_scan;
-    };
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    unsigned int linear_tid;
-
-    /// Copy of raking segment, promoted to registers
-    PackedCounter cached_segment[RAKING_SEGMENT];
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /**
-     * Internal storage allocator
-     */
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /**
-     * Performs upsweep raking reduction, returning the aggregate
-     */
-    __device__ __forceinline__ PackedCounter Upsweep()
-    {
-        PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid];
-        PackedCounter *raking_ptr;
-
-        if (MEMOIZE_OUTER_SCAN)
-        {
-            // Copy data into registers
-            #pragma unroll
-            for (int i = 0; i < RAKING_SEGMENT; i++)
-            {
-                cached_segment[i] = smem_raking_ptr[i];
-            }
-            raking_ptr = cached_segment;
-        }
-        else
-        {
-            raking_ptr = smem_raking_ptr;
-        }
-
-        return internal::ThreadReduce<RAKING_SEGMENT>(raking_ptr, Sum());
-    }
-
-
-    /// Performs exclusive downsweep raking scan
-    __device__ __forceinline__ void ExclusiveDownsweep(
-        PackedCounter raking_partial)
-    {
-        PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid];
-
-        PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ?
-            cached_segment :
-            smem_raking_ptr;
-
-        // Exclusive raking downsweep scan
-        internal::ThreadScanExclusive<RAKING_SEGMENT>(raking_ptr, raking_ptr, Sum(), raking_partial);
-
-        if (MEMOIZE_OUTER_SCAN)
-        {
-            // Copy data back to smem
-            #pragma unroll
-            for (int i = 0; i < RAKING_SEGMENT; i++)
-            {
-                smem_raking_ptr[i] = cached_segment[i];
-            }
-        }
-    }
-
-
-    /**
-     * Reset shared memory digit counters
-     */
-    __device__ __forceinline__ void ResetCounters()
-    {
-        // Reset shared memory digit counters
-        #pragma unroll
-        for (int LANE = 0; LANE < PADDED_COUNTER_LANES; LANE++)
-        {
-            *((PackedCounter*) temp_storage.aliasable.digit_counters[LANE][linear_tid]) = 0;
-        }
-    }
-
-
-    /**
-     * Block-scan prefix callback
-     */
-    struct PrefixCallBack
-    {
-        __device__ __forceinline__ PackedCounter operator()(PackedCounter block_aggregate)
-        {
-            PackedCounter block_prefix = 0;
-
-            // Propagate totals in packed fields
-            #pragma unroll
-            for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++)
-            {
-                block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED);
-            }
-
-            return block_prefix;
-        }
-    };
-
-
-    /**
-     * Scan shared memory digit counters.
-     */
-    __device__ __forceinline__ void ScanCounters()
-    {
-        // Upsweep scan
-        PackedCounter raking_partial = Upsweep();
-
-        // Compute exclusive sum
-        PackedCounter exclusive_partial;
-        PrefixCallBack prefix_call_back;
-        BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, prefix_call_back);
-
-        // Downsweep scan with exclusive partial
-        ExclusiveDownsweep(exclusive_partial);
-    }
-
-public:
-
-    /// \smemstorage{BlockScan}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockRadixRank()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockRadixRank(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Raking
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Rank keys.
-     */
-    template <
-        typename        UnsignedBits,
-        int             KEYS_PER_THREAD>
-    __device__ __forceinline__ void RankKeys(
-        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
-        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile
-        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
-        int             num_bits)                           ///< [in] The number of bits in the current digit
-    {
-        DigitCounter    thread_prefixes[KEYS_PER_THREAD];   // For each key, the count of previous keys in this tile having the same digit
-        DigitCounter*   digit_counters[KEYS_PER_THREAD];    // For each key, the byte-offset of its corresponding digit counter in smem
-
-        // Reset shared memory digit counters
-        ResetCounters();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
-        {
-            // Get digit
-            unsigned int digit = BFE(keys[ITEM], current_bit, num_bits);
-
-            // Get sub-counter
-            unsigned int sub_counter = digit >> LOG_COUNTER_LANES;
-
-            // Get counter lane
-            unsigned int counter_lane = digit & (COUNTER_LANES - 1);
-
-            if (IS_DESCENDING)
-            {
-                sub_counter = PACKING_RATIO - 1 - sub_counter;
-                counter_lane = COUNTER_LANES - 1 - counter_lane;
-            }
-
-            // Pointer to smem digit counter
-            digit_counters[ITEM] = &temp_storage.aliasable.digit_counters[counter_lane][linear_tid][sub_counter];
-
-            // Load thread-exclusive prefix
-            thread_prefixes[ITEM] = *digit_counters[ITEM];
-
-            // Store inclusive prefix
-            *digit_counters[ITEM] = thread_prefixes[ITEM] + 1;
-        }
-
-        CTA_SYNC();
-
-        // Scan shared memory counters
-        ScanCounters();
-
-        CTA_SYNC();
-
-        // Extract the local ranks of each key
-        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
-        {
-            // Add in thread block exclusive prefix
-            ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM];
-        }
-    }
-
-
-    /**
-     * \brief Rank keys.  For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread.
-     */
-    template <
-        typename        UnsignedBits,
-        int             KEYS_PER_THREAD>
-    __device__ __forceinline__ void RankKeys(
-        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
-        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile (out parameter)
-        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
-        int             num_bits,                           ///< [in] The number of bits in the current digit
-        int             (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])            ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
-    {
-        // Rank keys
-        RankKeys(keys, ranks, current_bit, num_bits);
-
-        // Get the inclusive and exclusive digit totals corresponding to the calling thread.
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track;
-
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-            {
-                if (IS_DESCENDING)
-                    bin_idx = RADIX_DIGITS - bin_idx - 1;
-
-                // Obtain ex/inclusive digit counts.  (Unfortunately these all reside in the
-                // first counter column, resulting in unavoidable bank conflicts.)
-                unsigned int counter_lane   = (bin_idx & (COUNTER_LANES - 1));
-                unsigned int sub_counter    = bin_idx >> (LOG_COUNTER_LANES);
-
-                exclusive_digit_prefix[track] = temp_storage.aliasable.digit_counters[counter_lane][0][sub_counter];
-            }
-        }
-    }
-};
-
-
-
-
-
-/**
- * Radix-rank using match.any
- */
-template <
-    int                     BLOCK_DIM_X,
-    int                     RADIX_BITS,
-    bool                    IS_DESCENDING,
-    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
-    int                     BLOCK_DIM_Y             = 1,
-    int                     BLOCK_DIM_Z             = 1,
-    int                     PTX_ARCH                = CUB_PTX_ARCH>
-class BlockRadixRankMatch
-{
-private:
-
-    /******************************************************************************
-     * Type definitions and constants
-     ******************************************************************************/
-
-    typedef int32_t    RankT;
-    typedef int32_t    DigitCounterT;
-
-    enum
-    {
-        // The thread block size in threads
-        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        RADIX_DIGITS                = 1 << RADIX_BITS,
-
-        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
-        WARP_THREADS                = 1 << LOG_WARP_THREADS,
-        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-
-        PADDED_WARPS            = ((WARPS & 0x1) == 0) ?
-                                    WARPS + 1 :
-                                    WARPS,
-
-        COUNTERS                = PADDED_WARPS * RADIX_DIGITS,
-        RAKING_SEGMENT          = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS,
-        PADDED_RAKING_SEGMENT   = ((RAKING_SEGMENT & 0x1) == 0) ?
-                                    RAKING_SEGMENT + 1 :
-                                    RAKING_SEGMENT,
-    };
-
-public:
-
-    enum
-    {
-        /// Number of bin-starting offsets tracked per thread
-        BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
-    };
-
-private:
-
-    /// BlockScan type
-    typedef BlockScan<
-            DigitCounterT,
-            BLOCK_THREADS,
-            INNER_SCAN_ALGORITHM,
-            BLOCK_DIM_Y,
-            BLOCK_DIM_Z,
-            PTX_ARCH>
-        BlockScanT;
-
-
-    /// Shared memory storage layout type for BlockRadixRank
-    struct __align__(16) _TempStorage
-    {
-        typename BlockScanT::TempStorage            block_scan;
-
-        union __align__(16) Aliasable
-        {
-            volatile DigitCounterT                  warp_digit_counters[RADIX_DIGITS][PADDED_WARPS];
-            DigitCounterT                           raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT];
-
-        } aliasable;
-    };
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    unsigned int linear_tid;
-
-
-
-public:
-
-    /// \smemstorage{BlockScan}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockRadixRankMatch(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Raking
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Rank keys.
-     */
-    template <
-        typename        UnsignedBits,
-        int             KEYS_PER_THREAD>
-    __device__ __forceinline__ void RankKeys(
-        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
-        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile
-        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
-        int             num_bits)                           ///< [in] The number of bits in the current digit
-    {
-        // Initialize shared digit counters
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
-            temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0;
-
-        CTA_SYNC();
-
-        // Each warp will strip-mine its section of input, one strip at a time
-
-        volatile DigitCounterT  *digit_counters[KEYS_PER_THREAD];
-        uint32_t                warp_id         = linear_tid >> LOG_WARP_THREADS;
-        uint32_t                lane_mask_lt    = LaneMaskLt();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
-        {
-            // My digit
-            uint32_t digit = BFE(keys[ITEM], current_bit, num_bits);
-
-            if (IS_DESCENDING)
-                digit = RADIX_DIGITS - digit - 1;
-
-            // Mask of peers who have same digit as me
-            uint32_t peer_mask = MatchAny<RADIX_BITS>(digit);
-
-            // Pointer to smem digit counter for this key
-            digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit][warp_id];
-
-            // Number of occurrences in previous strips
-            DigitCounterT warp_digit_prefix = *digit_counters[ITEM];
-
-            // Warp-sync
-            WARP_SYNC(0xFFFFFFFF);
-
-            // Number of peers having same digit as me
-            int32_t digit_count = __popc(peer_mask);
-
-            // Number of lower-ranked peers having same digit seen so far
-            int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt);
-
-            if (peer_digit_prefix == 0)
-            {
-                // First thread for each digit updates the shared warp counter
-                *digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count);
-            }
-
-            // Warp-sync
-            WARP_SYNC(0xFFFFFFFF);
-
-            // Number of prior keys having same digit
-            ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix);
-        }
-
-        CTA_SYNC();
-
-        // Scan warp counters
-
-        DigitCounterT scan_counters[PADDED_RAKING_SEGMENT];
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
-            scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid][ITEM];
-
-        BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters);
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
-            temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM];
-
-        CTA_SYNC();
-
-        // Seed ranks with counter values from previous warps
-        #pragma unroll
-        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
-            ranks[ITEM] += *digit_counters[ITEM];
-    }
-
-
-    /**
-     * \brief Rank keys.  For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread.
-     */
-    template <
-        typename        UnsignedBits,
-        int             KEYS_PER_THREAD>
-    __device__ __forceinline__ void RankKeys(
-        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
-        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile (out parameter)
-        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
-        int             num_bits,                           ///< [in] The number of bits in the current digit
-        int             (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])            ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
-    {
-        RankKeys(keys, ranks, current_bit, num_bits);
-
-        // Get exclusive count for each digit
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track;
-
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-            {
-                if (IS_DESCENDING)
-                    bin_idx = RADIX_DIGITS - bin_idx - 1;
-
-                exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx][0];
-            }
-        }
-    }
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh b/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh
deleted file mode 100644
index 8a54b3fb9..000000000
--- a/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh
+++ /dev/null
@@ -1,863 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block.
- */
-
-
-#pragma once
-
-#include "block_exchange.cuh"
-#include "block_radix_rank.cuh"
-#include "../util_ptx.cuh"
-#include "../util_arch.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief The BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method.  ![](sorting_logo.png)
- * \ingroup BlockModule
- *
- * \tparam KeyT                 KeyT type
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam ITEMS_PER_THREAD     The number of items per thread
- * \tparam ValueT               <b>[optional]</b> ValueT type (default: cub::NullType, which indicates a keys-only sort)
- * \tparam RADIX_BITS           <b>[optional]</b> The number of radix bits per digit place (default: 4 bits)
- * \tparam MEMOIZE_OUTER_SCAN   <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise).
- * \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
- * \tparam SMEM_CONFIG          <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
- * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
- *   items into ascending order.  It relies upon a positional representation for
- *   keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
- *   characters, etc.) specified from least-significant to most-significant.  For a
- *   given input sequence of keys and a set of rules specifying a total ordering
- *   of the symbolic alphabet, the radix sorting method produces a lexicographic
- *   ordering of those keys.
- * - BlockRadixSort can sort all of the built-in C++ numeric primitive types
- *   (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half
- *   half-precision floating-point type. Within each key, the implementation treats fixed-length
- *   bit-sequences of \p RADIX_BITS as radix digit places.  Although the direct radix sorting
- *   method can only be applied to unsigned integral types, BlockRadixSort
- *   is able to sort signed and floating-point types via simple bit-wise transformations
- *   that ensure lexicographic key ordering.
- * - \rowmajor
- *
- * \par Performance Considerations
- * - \granularity
- *
- * \par A Simple Example
- * \blockcollective{BlockRadixSort}
- * \par
- * The code snippet below illustrates a sort of 512 integer keys that
- * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
- *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
- *
- *     // Allocate shared memory for BlockRadixSort
- *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_keys[4];
- *     ...
- *
- *     // Collectively sort the keys
- *     BlockRadixSort(temp_storage).Sort(thread_keys);
- *
- *     ...
- * \endcode
- * \par
- * Suppose the set of input \p thread_keys across the block of threads is
- * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
- * corresponding output \p thread_keys in those threads will be
- * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
- *
- */
-template <
-    typename                KeyT,
-    int                     BLOCK_DIM_X,
-    int                     ITEMS_PER_THREAD,
-    typename                ValueT                   = NullType,
-    int                     RADIX_BITS              = 4,
-    bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ? true : false,
-    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
-    cudaSharedMemConfig     SMEM_CONFIG             = cudaSharedMemBankSizeFourByte,
-    int                     BLOCK_DIM_Y             = 1,
-    int                     BLOCK_DIM_Z             = 1,
-    int                     PTX_ARCH                = CUB_PTX_ARCH>
-class BlockRadixSort
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    enum
-    {
-        // The thread block size in threads
-        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        // Whether or not there are values to be trucked along with keys
-        KEYS_ONLY                   = Equals<ValueT, NullType>::VALUE,
-    };
-
-    // KeyT traits and unsigned bits type
-    typedef Traits<KeyT>                        KeyTraits;
-    typedef typename KeyTraits::UnsignedBits    UnsignedBits;
-
-    /// Ascending BlockRadixRank utility type
-    typedef BlockRadixRank<
-            BLOCK_DIM_X,
-            RADIX_BITS,
-            false,
-            MEMOIZE_OUTER_SCAN,
-            INNER_SCAN_ALGORITHM,
-            SMEM_CONFIG,
-            BLOCK_DIM_Y,
-            BLOCK_DIM_Z,
-            PTX_ARCH>
-        AscendingBlockRadixRank;
-
-    /// Descending BlockRadixRank utility type
-    typedef BlockRadixRank<
-            BLOCK_DIM_X,
-            RADIX_BITS,
-            true,
-            MEMOIZE_OUTER_SCAN,
-            INNER_SCAN_ALGORITHM,
-            SMEM_CONFIG,
-            BLOCK_DIM_Y,
-            BLOCK_DIM_Z,
-            PTX_ARCH>
-        DescendingBlockRadixRank;
-
-    /// BlockExchange utility type for keys
-    typedef BlockExchange<KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeKeys;
-
-    /// BlockExchange utility type for values
-    typedef BlockExchange<ValueT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeValues;
-
-    /// Shared memory storage layout type
-    union _TempStorage
-    {
-        typename AscendingBlockRadixRank::TempStorage  asending_ranking_storage;
-        typename DescendingBlockRadixRank::TempStorage descending_ranking_storage;
-        typename BlockExchangeKeys::TempStorage        exchange_keys;
-        typename BlockExchangeValues::TempStorage      exchange_values;
-    };
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    unsigned int linear_tid;
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-    /// Rank keys (specialized for ascending sort)
-    __device__ __forceinline__ void RankKeys(
-        UnsignedBits    (&unsigned_keys)[ITEMS_PER_THREAD],
-        int             (&ranks)[ITEMS_PER_THREAD],
-        int             begin_bit,
-        int             pass_bits,
-        Int2Type<false> /*is_descending*/)
-    {
-        AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys(
-            unsigned_keys,
-            ranks,
-            begin_bit,
-            pass_bits);
-    }
-
-    /// Rank keys (specialized for descending sort)
-    __device__ __forceinline__ void RankKeys(
-        UnsignedBits    (&unsigned_keys)[ITEMS_PER_THREAD],
-        int             (&ranks)[ITEMS_PER_THREAD],
-        int             begin_bit,
-        int             pass_bits,
-        Int2Type<true>  /*is_descending*/)
-    {
-        DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys(
-            unsigned_keys,
-            ranks,
-            begin_bit,
-            pass_bits);
-    }
-
-    /// ExchangeValues (specialized for key-value sort, to-blocked arrangement)
-    __device__ __forceinline__ void ExchangeValues(
-        ValueT          (&values)[ITEMS_PER_THREAD],
-        int             (&ranks)[ITEMS_PER_THREAD],
-        Int2Type<false> /*is_keys_only*/,
-        Int2Type<true>  /*is_blocked*/)
-    {
-        CTA_SYNC();
-
-        // Exchange values through shared memory in blocked arrangement
-        BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks);
-    }
-
-    /// ExchangeValues (specialized for key-value sort, to-striped arrangement)
-    __device__ __forceinline__ void ExchangeValues(
-        ValueT          (&values)[ITEMS_PER_THREAD],
-        int             (&ranks)[ITEMS_PER_THREAD],
-        Int2Type<false> /*is_keys_only*/,
-        Int2Type<false> /*is_blocked*/)
-    {
-        CTA_SYNC();
-
-        // Exchange values through shared memory in blocked arrangement
-        BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks);
-    }
-
-    /// ExchangeValues (specialized for keys-only sort)
-    template <int IS_BLOCKED>
-    __device__ __forceinline__ void ExchangeValues(
-        ValueT                  (&/*values*/)[ITEMS_PER_THREAD],
-        int                     (&/*ranks*/)[ITEMS_PER_THREAD],
-        Int2Type<true>          /*is_keys_only*/,
-        Int2Type<IS_BLOCKED>    /*is_blocked*/)
-    {}
-
-    /// Sort blocked arrangement
-    template <int DESCENDING, int KEYS_ONLY>
-    __device__ __forceinline__ void SortBlocked(
-        KeyT                    (&keys)[ITEMS_PER_THREAD],          ///< Keys to sort
-        ValueT                  (&values)[ITEMS_PER_THREAD],        ///< Values to sort
-        int                     begin_bit,                          ///< The beginning (least-significant) bit index needed for key comparison
-        int                     end_bit,                            ///< The past-the-end (most-significant) bit index needed for key comparison
-        Int2Type<DESCENDING>    is_descending,                      ///< Tag whether is a descending-order sort
-        Int2Type<KEYS_ONLY>     is_keys_only)                       ///< Tag whether is keys-only sort
-    {
-        UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
-            reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
-
-        // Twiddle bits if necessary
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
-        }
-
-        // Radix sorting passes
-        while (true)
-        {
-            int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
-
-            // Rank the blocked keys
-            int ranks[ITEMS_PER_THREAD];
-            RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending);
-            begin_bit += RADIX_BITS;
-
-            CTA_SYNC();
-
-            // Exchange keys through shared memory in blocked arrangement
-            BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
-
-            // Exchange values through shared memory in blocked arrangement
-            ExchangeValues(values, ranks, is_keys_only, Int2Type<true>());
-
-            // Quit if done
-            if (begin_bit >= end_bit) break;
-
-            CTA_SYNC();
-        }
-
-        // Untwiddle bits if necessary
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
-        }
-    }
-
-public:
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-    /// Sort blocked -> striped arrangement
-    template <int DESCENDING, int KEYS_ONLY>
-    __device__ __forceinline__ void SortBlockedToStriped(
-        KeyT                    (&keys)[ITEMS_PER_THREAD],          ///< Keys to sort
-        ValueT                  (&values)[ITEMS_PER_THREAD],        ///< Values to sort
-        int                     begin_bit,                          ///< The beginning (least-significant) bit index needed for key comparison
-        int                     end_bit,                            ///< The past-the-end (most-significant) bit index needed for key comparison
-        Int2Type<DESCENDING>    is_descending,                      ///< Tag whether is a descending-order sort
-        Int2Type<KEYS_ONLY>     is_keys_only)                       ///< Tag whether is keys-only sort
-    {
-        UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
-            reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
-
-        // Twiddle bits if necessary
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
-        }
-
-        // Radix sorting passes
-        while (true)
-        {
-            int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
-
-            // Rank the blocked keys
-            int ranks[ITEMS_PER_THREAD];
-            RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending);
-            begin_bit += RADIX_BITS;
-
-            CTA_SYNC();
-
-            // Check if this is the last pass
-            if (begin_bit >= end_bit)
-            {
-                // Last pass exchanges keys through shared memory in striped arrangement
-                BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks);
-
-                // Last pass exchanges through shared memory in striped arrangement
-                ExchangeValues(values, ranks, is_keys_only, Int2Type<false>());
-
-                // Quit
-                break;
-            }
-
-            // Exchange keys through shared memory in blocked arrangement
-            BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
-
-            // Exchange values through shared memory in blocked arrangement
-            ExchangeValues(values, ranks, is_keys_only, Int2Type<true>());
-
-            CTA_SYNC();
-        }
-
-        // Untwiddle bits if necessary
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
-        }
-    }
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-    /// \smemstorage{BlockRadixSort}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockRadixSort()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockRadixSort(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Sorting (blocked arrangements)
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Performs an ascending block-wide radix sort over a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys.
-     *
-     * \par
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
-     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     ...
-     *
-     *     // Collectively sort the keys
-     *     BlockRadixSort(temp_storage).Sort(thread_keys);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.
-     * The corresponding output \p thread_keys in those threads will be
-     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
-     */
-    __device__ __forceinline__ void Sort(
-        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        NullType values[ITEMS_PER_THREAD];
-
-        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
-    }
-
-
-    /**
-     * \brief Performs an ascending block-wide radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values.
-     *
-     * \par
-     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
-     *   more than one tile of values, simply perform a key-value sort of the keys paired
-     *   with a temporary value array that enumerates the key indices.  The reordered indices
-     *   can then be used as a gather-vector for exchanging other associated tile data through
-     *   shared memory.
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys and values that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive pairs.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
-     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     int thread_values[4];
-     *     ...
-     *
-     *     // Collectively sort the keys and values among block threads
-     *     BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
-     * corresponding output \p thread_keys in those threads will be
-     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
-     *
-     */
-    __device__ __forceinline__ void Sort(
-        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
-    }
-
-    /**
-     * \brief Performs a descending block-wide radix sort over a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys.
-     *
-     * \par
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
-     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     ...
-     *
-     *     // Collectively sort the keys
-     *     BlockRadixSort(temp_storage).Sort(thread_keys);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.
-     * The corresponding output \p thread_keys in those threads will be
-     * <tt>{ [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }</tt>.
-     */
-    __device__ __forceinline__ void SortDescending(
-        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        NullType values[ITEMS_PER_THREAD];
-
-        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
-    }
-
-
-    /**
-     * \brief Performs a descending block-wide radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values.
-     *
-     * \par
-     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
-     *   more than one tile of values, simply perform a key-value sort of the keys paired
-     *   with a temporary value array that enumerates the key indices.  The reordered indices
-     *   can then be used as a gather-vector for exchanging other associated tile data through
-     *   shared memory.
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys and values that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive pairs.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
-     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     int thread_values[4];
-     *     ...
-     *
-     *     // Collectively sort the keys and values among block threads
-     *     BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
-     * corresponding output \p thread_keys in those threads will be
-     * <tt>{ [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }</tt>.
-     *
-     */
-    __device__ __forceinline__ void SortDescending(
-        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Sorting (blocked arrangement -> striped arrangement)
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Performs an ascending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
-     *
-     * \par
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys that
-     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive keys.  The final partitioning is striped.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
-     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     ...
-     *
-     *     // Collectively sort the keys
-     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
-     * corresponding output \p thread_keys in those threads will be
-     * <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>.
-     *
-     */
-    __device__ __forceinline__ void SortBlockedToStriped(
-        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        NullType values[ITEMS_PER_THREAD];
-
-        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
-    }
-
-
-    /**
-     * \brief Performs an ascending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
-     *
-     * \par
-     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
-     *   more than one tile of values, simply perform a key-value sort of the keys paired
-     *   with a temporary value array that enumerates the key indices.  The reordered indices
-     *   can then be used as a gather-vector for exchanging other associated tile data through
-     *   shared memory.
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys and values that
-     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive pairs.  The final partitioning is striped.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
-     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     int thread_values[4];
-     *     ...
-     *
-     *     // Collectively sort the keys and values among block threads
-     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
-     * corresponding output \p thread_keys in those threads will be
-     * <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>.
-     *
-     */
-    __device__ __forceinline__ void SortBlockedToStriped(
-        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
-    }
-
-
-    /**
-     * \brief Performs a descending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
-     *
-     * \par
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys that
-     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive keys.  The final partitioning is striped.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
-     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     ...
-     *
-     *     // Collectively sort the keys
-     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
-     * corresponding output \p thread_keys in those threads will be
-     * <tt>{ [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }</tt>.
-     *
-     */
-    __device__ __forceinline__ void SortDescendingBlockedToStriped(
-        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        NullType values[ITEMS_PER_THREAD];
-
-        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
-    }
-
-
-    /**
-     * \brief Performs a descending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
-     *
-     * \par
-     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
-     *   more than one tile of values, simply perform a key-value sort of the keys paired
-     *   with a temporary value array that enumerates the key indices.  The reordered indices
-     *   can then be used as a gather-vector for exchanging other associated tile data through
-     *   shared memory.
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys and values that
-     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive pairs.  The final partitioning is striped.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
-     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     int thread_values[4];
-     *     ...
-     *
-     *     // Collectively sort the keys and values among block threads
-     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
-     * corresponding output \p thread_keys in those threads will be
-     * <tt>{ [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }</tt>.
-     *
-     */
-    __device__ __forceinline__ void SortDescendingBlockedToStriped(
-        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
-    }
-
-
-    //@}  end member group
-
-};
-
-/**
- * \example example_block_radix_sort.cu
- */
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh b/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh
deleted file mode 100644
index 9cf4ffa97..000000000
--- a/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh
+++ /dev/null
@@ -1,152 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data.
- */
-
-
-#pragma once
-
-#include "../util_macro.cuh"
-#include "../util_arch.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data.    ![](raking.png)
- * \ingroup BlockModule
- *
- * \par Overview
- * This type facilitates a shared memory usage pattern where a block of CUDA
- * threads places elements into shared memory and then reduces the active
- * parallelism to one "raking" warp of threads for serially aggregating consecutive
- * sequences of shared items.  Padding is inserted to eliminate bank conflicts
- * (for most data types).
- *
- * \tparam T                        The data type to be exchanged.
- * \tparam BLOCK_THREADS            The thread block size in threads.
- * \tparam PTX_ARCH                 <b>[optional]</b> \ptxversion
- */
-template <
-    typename    T,
-    int         BLOCK_THREADS,
-    int         PTX_ARCH = CUB_PTX_ARCH>
-struct BlockRakingLayout
-{
-    //---------------------------------------------------------------------
-    // Constants and type definitions
-    //---------------------------------------------------------------------
-
-    enum
-    {
-        /// The total number of elements that need to be cooperatively reduced
-        SHARED_ELEMENTS = BLOCK_THREADS,
-
-        /// Maximum number of warp-synchronous raking threads
-        MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// Number of raking elements per warp-synchronous raking thread (rounded up)
-        SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,
-
-        /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
-        RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,
-
-        /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1)
-        HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0),
-
-        /// Degree of bank conflicts (e.g., 4-way)
-        CONFLICT_DEGREE = (HAS_CONFLICTS) ?
-            (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) :
-            1,
-
-        /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load
-        USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2),
-
-        /// Total number of elements in the raking grid
-        GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING),
-
-        /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads)
-        UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0),
-    };
-
-
-    /**
-     * \brief Shared memory storage type
-     */
-    struct __align__(16) _TempStorage
-    {
-        T buff[BlockRakingLayout::GRID_ELEMENTS];
-    };
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /**
-     * \brief Returns the location for the calling thread to place data into the grid
-     */
-    static __device__ __forceinline__ T* PlacementPtr(
-        TempStorage &temp_storage,
-        unsigned int linear_tid)
-    {
-        // Offset for partial
-        unsigned int offset = linear_tid;
-
-        // Add in one padding element for every segment
-        if (USE_SEGMENT_PADDING > 0)
-        {
-            offset += offset / SEGMENT_LENGTH;
-        }
-
-        // Incorporating a block of padding partials every shared memory segment
-        return temp_storage.Alias().buff + offset;
-    }
-
-
-    /**
-     * \brief Returns the location for the calling thread to begin sequential raking
-     */
-    static __device__ __forceinline__ T* RakingPtr(
-        TempStorage &temp_storage,
-        unsigned int linear_tid)
-    {
-        return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING));
-    }
-};
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/block_reduce.cuh b/thrust/system/cuda/detail/cub/block/block_reduce.cuh
deleted file mode 100644
index 12a79ecea..000000000
--- a/thrust/system/cuda/detail/cub/block/block_reduce.cuh
+++ /dev/null
@@ -1,607 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "specializations/block_reduce_raking.cuh"
-#include "specializations/block_reduce_raking_commutative_only.cuh"
-#include "specializations/block_reduce_warp_reductions.cuh"
-#include "../util_ptx.cuh"
-#include "../util_type.cuh"
-#include "../thread/thread_operators.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-
-/******************************************************************************
- * Algorithmic variants
- ******************************************************************************/
-
-/**
- * BlockReduceAlgorithm enumerates alternative algorithms for parallel
- * reduction across a CUDA thread block.
- */
-enum BlockReduceAlgorithm
-{
-
-    /**
-     * \par Overview
-     * An efficient "raking" reduction algorithm that only supports commutative
-     * reduction operators (true for most operations, e.g., addition).
-     *
-     * \par
-     * Execution is comprised of three phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more
-     *    than one input each).  Threads in warps other than the first warp place
-     *    their partial reductions into shared memory.
-     * -# Upsweep sequential reduction in shared memory.  Threads within the first
-     *    warp continue to accumulate by raking across segments of shared partial reductions
-     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
-     *
-     * \par
-     * \image html block_reduce.png
-     * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
-     *
-     * \par Performance Considerations
-     * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE
-     *   and is preferable when the reduction operator is commutative.  This variant
-     *   applies fewer reduction operators  than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
-     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
-     *   higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable
-     *   when the GPU is under-occupied.
-     */
-    BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,
-
-
-    /**
-     * \par Overview
-     * An efficient "raking" reduction algorithm that supports commutative
-     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
-     * operators. \blocked.
-     *
-     * \par
-     * Execution is comprised of three phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more
-     *    than one input each).  Each thread then places the partial reduction
-     *    of its item(s) into shared memory.
-     * -# Upsweep sequential reduction in shared memory.  Threads within a
-     *    single warp rake across segments of shared partial reductions.
-     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
-     *
-     * \par
-     * \image html block_reduce.png
-     * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
-     *
-     * \par Performance Considerations
-     * - This variant performs more communication than BLOCK_REDUCE_RAKING
-     *   and is only preferable when the reduction operator is non-commutative.  This variant
-     *   applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
-     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
-     *   higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable
-     *   when the GPU is under-occupied.
-     */
-    BLOCK_REDUCE_RAKING,
-
-
-    /**
-     * \par Overview
-     * A quick "tiled warp-reductions" reduction algorithm that supports commutative
-     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
-     * operators.
-     *
-     * \par
-     * Execution is comprised of four phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more
-     *    than one input each).  Each thread then places the partial reduction
-     *    of its item(s) into shared memory.
-     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style
-     *    reduction within each warp.
-     * -# A propagation phase where the warp reduction outputs in each warp are
-     *    updated with the aggregate from each preceding warp.
-     *
-     * \par
-     * \image html block_scan_warpscans.png
-     * <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
-     *
-     * \par Performance Considerations
-     * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING
-     *   or BLOCK_REDUCE_RAKING_NON_COMMUTATIVE, which may result in lower overall
-     *   throughput across the GPU.  However turn-around latency may be lower and
-     *   thus useful when the GPU is under-occupied.
-     */
-    BLOCK_REDUCE_WARP_REDUCTIONS,
-};
-
-
-/******************************************************************************
- * Block reduce
- ******************************************************************************/
-
-/**
- * \brief The BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png)
- * \ingroup BlockModule
- *
- * \tparam T                Data type being reduced
- * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
- * \tparam ALGORITHM        <b>[optional]</b> cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS)
- * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
- *   uses a binary combining operator to compute a single aggregate from a list of input elements.
- * - \rowmajor
- * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles:
- *   -# <b>cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY</b>.  An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
- *   -# <b>cub::BLOCK_REDUCE_RAKING</b>.  An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
- *   -# <b>cub::BLOCK_REDUCE_WARP_REDUCTIONS</b>.  A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
- *
- * \par Performance Considerations
- * - \granularity
- * - Very efficient (only one synchronization barrier).
- * - Incurs zero bank conflicts for most types
- * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
- *   - Summation (<b><em>vs.</em></b> generic reduction)
- *   - \p BLOCK_THREADS is a multiple of the architecture's warp size
- *   - Every thread has a valid input (i.e., full <b><em>vs.</em></b> partial-tiles)
- * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
- *
- * \par A Simple Example
- * \blockcollective{BlockReduce}
- * \par
- * The code snippet below illustrates a sum reduction of 512 integer items that
- * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize BlockReduce for a 1D block of 128 threads on type int
- *     typedef cub::BlockReduce<int, 128> BlockReduce;
- *
- *     // Allocate shared memory for BlockReduce
- *     __shared__ typename BlockReduce::TempStorage temp_storage;
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     ...
- *
- *     // Compute the block-wide sum for thread0
- *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
- *
- * \endcode
- *
- */
-template <
-    typename                T,
-    int                     BLOCK_DIM_X,
-    BlockReduceAlgorithm    ALGORITHM       = BLOCK_REDUCE_WARP_REDUCTIONS,
-    int                     BLOCK_DIM_Y     = 1,
-    int                     BLOCK_DIM_Z     = 1,
-    int                     PTX_ARCH        = CUB_PTX_ARCH>
-class BlockReduce
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    typedef BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>           WarpReductions;
-    typedef BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>    RakingCommutativeOnly;
-    typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>                   Raking;
-
-    /// Internal specialization type
-    typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS),
-        WarpReductions,
-        typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY),
-            RakingCommutativeOnly,
-            Raking>::Type>::Type InternalBlockReduce;     // BlockReduceRaking
-
-    /// Shared memory storage layout type for BlockReduce
-    typedef typename InternalBlockReduce::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    unsigned int linear_tid;
-
-
-public:
-
-    /// \smemstorage{BlockReduce}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockReduce()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockReduce(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Generic reductions
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes one input element.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread<sub>0</sub>.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a max reduction of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Each thread obtains an input item
-     *     int thread_data;
-     *     ...
-     *
-     *     // Compute the block-wide max for thread0
-     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
-     *
-     * \endcode
-     *
-     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T               input,                      ///< [in] Calling thread's input
-        ReductionOp     reduction_op)               ///< [in] Binary reduction functor 
-    {
-        return InternalBlockReduce(temp_storage).template Reduce<true>(input, BLOCK_THREADS, reduction_op);
-    }
-
-
-    /**
-     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes an array of consecutive input elements.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread<sub>0</sub>.
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a max reduction of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Compute the block-wide max for thread0
-     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
-     *
-     * \endcode
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        int ITEMS_PER_THREAD,
-        typename ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T               (&inputs)[ITEMS_PER_THREAD],    ///< [in] Calling thread's input segment
-        ReductionOp     reduction_op)                   ///< [in] Binary reduction functor 
-    {
-        // Reduce partials
-        T partial = internal::ThreadReduce(inputs, reduction_op);
-        return Reduce(partial, reduction_op);
-    }
-
-
-    /**
-     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  The first \p num_valid threads each contribute one input element.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread<sub>0</sub>.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a max reduction of a partially-full tile of integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(int num_valid, ...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Each thread obtains an input item
-     *     int thread_data;
-     *     if (threadIdx.x < num_valid) thread_data = ...
-     *
-     *     // Compute the block-wide max for thread0
-     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);
-     *
-     * \endcode
-     *
-     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T                   input,                  ///< [in] Calling thread's input
-        ReductionOp         reduction_op,           ///< [in] Binary reduction functor 
-        int                 num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
-    {
-        // Determine if we scan skip bounds checking
-        if (num_valid >= BLOCK_THREADS)
-        {
-            return InternalBlockReduce(temp_storage).template Reduce<true>(input, num_valid, reduction_op);
-        }
-        else
-        {
-            return InternalBlockReduce(temp_storage).template Reduce<false>(input, num_valid, reduction_op);
-        }
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Summation reductions
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes one input element.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread<sub>0</sub>.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sum reduction of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Each thread obtains an input item
-     *     int thread_data;
-     *     ...
-     *
-     *     // Compute the block-wide sum for thread0
-     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
-     *
-     * \endcode
-     *
-     */
-    __device__ __forceinline__ T Sum(
-        T   input)                      ///< [in] Calling thread's input
-    {
-        return InternalBlockReduce(temp_storage).template Sum<true>(input, BLOCK_THREADS);
-    }
-
-    /**
-     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes an array of consecutive input elements.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread<sub>0</sub>.
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sum reduction of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Compute the block-wide sum for thread0
-     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
-     *
-     * \endcode
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ T Sum(
-        T   (&inputs)[ITEMS_PER_THREAD])    ///< [in] Calling thread's input segment
-    {
-        // Reduce partials
-        T partial = internal::ThreadReduce(inputs, cub::Sum());
-        return Sum(partial);
-    }
-
-
-    /**
-     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  The first \p num_valid threads each contribute one input element.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread<sub>0</sub>.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(int num_valid, ...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Each thread obtains an input item (up to num_items)
-     *     int thread_data;
-     *     if (threadIdx.x < num_valid)
-     *         thread_data = ...
-     *
-     *     // Compute the block-wide sum for thread0
-     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid);
-     *
-     * \endcode
-     *
-     */
-    __device__ __forceinline__ T Sum(
-        T   input,                  ///< [in] Calling thread's input
-        int num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
-    {
-        // Determine if we scan skip bounds checking
-        if (num_valid >= BLOCK_THREADS)
-        {
-            return InternalBlockReduce(temp_storage).template Sum<true>(input, num_valid);
-        }
-        else
-        {
-            return InternalBlockReduce(temp_storage).template Sum<false>(input, num_valid);
-        }
-    }
-
-
-    //@}  end member group
-};
-
-/**
- * \example example_block_reduce.cu
- */
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/block_scan.cuh b/thrust/system/cuda/detail/cub/block/block_scan.cuh
deleted file mode 100644
index c553cfbe4..000000000
--- a/thrust/system/cuda/detail/cub/block/block_scan.cuh
+++ /dev/null
@@ -1,2126 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "specializations/block_scan_raking.cuh"
-#include "specializations/block_scan_warp_scans.cuh"
-#include "../util_arch.cuh"
-#include "../util_type.cuh"
-#include "../util_ptx.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Algorithmic variants
- ******************************************************************************/
-
-/**
- * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block.
- */
-enum BlockScanAlgorithm
-{
-
-    /**
-     * \par Overview
-     * An efficient "raking reduce-then-scan" prefix scan algorithm.  Execution is comprised of five phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more than one input each).  Each thread then places the partial reduction of its item(s) into shared memory.
-     * -# Upsweep sequential reduction in shared memory.  Threads within a single warp rake across segments of shared partial reductions.
-     * -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp.
-     * -# Downsweep sequential exclusive scan in shared memory.  Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output.
-     * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
-     *
-     * \par
-     * \image html block_scan_raking.png
-     * <div class="centercaption">\p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
-     *
-     * \par Performance Considerations
-     * - Although this variant may suffer longer turnaround latencies when the
-     *   GPU is under-occupied, it can often provide higher overall throughput
-     *   across the GPU when suitably occupied.
-     */
-    BLOCK_SCAN_RAKING,
-
-
-    /**
-     * \par Overview
-     * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at
-     * the expense of higher register pressure.  Raking threads preserve their
-     * "upsweep" segment of values in registers while performing warp-synchronous
-     * scan, allowing the "downsweep" not to re-read them from shared memory.
-     */
-    BLOCK_SCAN_RAKING_MEMOIZE,
-
-
-    /**
-     * \par Overview
-     * A quick "tiled warpscans" prefix scan algorithm.  Execution is comprised of four phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more than one input each).  Each thread then places the partial reduction of its item(s) into shared memory.
-     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp.
-     * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp.
-     * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
-     *
-     * \par
-     * \image html block_scan_warpscans.png
-     * <div class="centercaption">\p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
-     *
-     * \par Performance Considerations
-     * - Although this variant may suffer lower overall throughput across the
-     *   GPU because due to a heavy reliance on inefficient warpscans, it can
-     *   often provide lower turnaround latencies when the GPU is under-occupied.
-     */
-    BLOCK_SCAN_WARP_SCANS,
-};
-
-
-/******************************************************************************
- * Block scan
- ******************************************************************************/
-
-/**
- * \brief The BlockScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. ![](block_scan_logo.png)
- * \ingroup BlockModule
- *
- * \tparam T                Data type being scanned
- * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
- * \tparam ALGORITHM        <b>[optional]</b> cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING)
- * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - Given a list of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
- *   produces an output list where each element is computed to be the reduction
- *   of the elements occurring earlier in the input list.  <em>Prefix sum</em>
- *   connotes a prefix scan with the addition operator. The term \em inclusive indicates
- *   that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
- *   The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
- *   the <em>i</em><sup>th</sup> output reduction.
- * - \rowmajor
- * - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles:
- *   -# <b>cub::BLOCK_SCAN_RAKING</b>.  An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
- *   -# <b>cub::BLOCK_SCAN_RAKING_MEMOIZE</b>.  Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm)
- *   -# <b>cub::BLOCK_SCAN_WARP_SCANS</b>.  A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
- *
- * \par Performance Considerations
- * - \granularity
- * - Uses special instructions when applicable (e.g., warp \p SHFL)
- * - Uses synchronization-free communication between warp lanes when applicable
- * - Invokes a minimal number of minimal block-wide synchronization barriers (only
- *   one or two depending on algorithm selection)
- * - Incurs zero bank conflicts for most types
- * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
- *   - Prefix sum variants (<b><em>vs.</em></b> generic scan)
- *   - \blocksize
- * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives
- *
- * \par A Simple Example
- * \blockcollective{BlockScan}
- * \par
- * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
- * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize BlockScan for a 1D block of 128 threads on type int
- *     typedef cub::BlockScan<int, 128> BlockScan;
- *
- *     // Allocate shared memory for BlockScan
- *     __shared__ typename BlockScan::TempStorage temp_storage;
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     ...
- *
- *     // Collectively compute the block-wide exclusive prefix sum
- *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the block of threads is
- * <tt>{[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}</tt>.
- * The corresponding output \p thread_data in those threads will be
- * <tt>{[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}</tt>.
- *
- */
-template <
-    typename            T,
-    int                 BLOCK_DIM_X,
-    BlockScanAlgorithm  ALGORITHM       = BLOCK_SCAN_RAKING,
-    int                 BLOCK_DIM_Y     = 1,
-    int                 BLOCK_DIM_Z     = 1,
-    int                 PTX_ARCH        = CUB_PTX_ARCH>
-class BlockScan
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    /**
-     * Ensure the template parameterization meets the requirements of the
-     * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy
-     * cannot be used with thread block sizes not a multiple of the
-     * architectural warp size.
-     */
-    static const BlockScanAlgorithm SAFE_ALGORITHM =
-        ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) != 0)) ?
-            BLOCK_SCAN_RAKING :
-            ALGORITHM;
-
-    typedef BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> WarpScans;
-    typedef BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE), PTX_ARCH> Raking;
-
-    /// Define the delegate type for the desired algorithm
-    typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS),
-        WarpScans,
-        Raking>::Type InternalBlockScan;
-
-    /// Shared memory storage layout type for BlockScan
-    typedef typename InternalBlockScan::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    unsigned int linear_tid;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /******************************************************************************
-     * Public types
-     ******************************************************************************/
-public:
-
-    /// \smemstorage{BlockScan}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockScan()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockScan(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Exclusive prefix sum operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  The value of 0 is applied as the initial value, and is assigned to \p output in <em>thread</em><sub>0</sub>.
-     *
-     * \par
-     * - \identityzero
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix sum of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix sum
-     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>0, 1, ..., 127</tt>.
-     *
-     */
-    __device__ __forceinline__ void ExclusiveSum(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output)                        ///< [out] Calling thread's output item (may be aliased to \p input)
-    {
-        T initial_value = 0;
-        ExclusiveScan(input, output, initial_value, cub::Sum());
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  The value of 0 is applied as the initial value, and is assigned to \p output in <em>thread</em><sub>0</sub>.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - \identityzero
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix sum of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix sum
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>0, 1, ..., 127</tt>.
-     * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads.
-     *
-     */
-    __device__ __forceinline__ void ExclusiveSum(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
-    {
-        T initial_value = 0;
-        ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate);
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - \identityzero
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an exclusive prefix sum over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 128 integer items that are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total += block_aggregate;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(0);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data = d_data[block_offset];
-     *
-     *         // Collectively compute the block-wide exclusive prefix sum
-     *         BlockScan(temp_storage).ExclusiveSum(
-     *             thread_data, thread_data, prefix_op);
-     *         CTA_SYNC();
-     *
-     *         // Store scanned items to output segment
-     *         d_data[block_offset] = thread_data;
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
-     * The corresponding output for the first segment will be <tt>0, 1, ..., 127</tt>.
-     * The output for the second segment will be <tt>128, 129, ..., 255</tt>.
-     *
-     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveSum(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
-    {
-        ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Exclusive prefix sum operations (multiple data per thread)
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  The value of 0 is applied as the initial value, and is assigned to \p output[0] in <em>thread</em><sub>0</sub>.
-     *
-     * \par
-     * - \identityzero
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix sum
-     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void ExclusiveSum(
-        T                 (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
-        T                 (&output)[ITEMS_PER_THREAD])  ///< [out] Calling thread's output items (may be aliased to \p input)
-    {
-        T initial_value = 0;
-        ExclusiveScan(input, output, initial_value, cub::Sum());
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  The value of 0 is applied as the initial value, and is assigned to \p output[0] in <em>thread</em><sub>0</sub>.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - \identityzero
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix sum
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
-     * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void ExclusiveSum(
-        T                 (&input)[ITEMS_PER_THREAD],       ///< [in] Calling thread's input items
-        T                 (&output)[ITEMS_PER_THREAD],      ///< [out] Calling thread's output items (may be aliased to \p input)
-        T                 &block_aggregate)                 ///< [out] block-wide aggregate reduction of input items
-    {
-        // Reduce consecutive thread items in registers
-        T initial_value = 0;
-        ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate);
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - \identityzero
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an exclusive prefix sum over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 512 integer items that are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3)
-     * across 128 threads where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total += block_aggregate;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
-     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
-     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE>  BlockStore;
-     *     typedef cub::BlockScan<int, 128>                             BlockScan;
-     *
-     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
-     *     __shared__ union {
-     *         typename BlockLoad::TempStorage     load;
-     *         typename BlockScan::TempStorage     scan;
-     *         typename BlockStore::TempStorage    store;
-     *     } temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(0);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data[4];
-     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
-     *         CTA_SYNC();
-     *
-     *         // Collectively compute the block-wide exclusive prefix sum
-     *         int block_aggregate;
-     *         BlockScan(temp_storage.scan).ExclusiveSum(
-     *             thread_data, thread_data, prefix_op);
-     *         CTA_SYNC();
-     *
-     *         // Store scanned items to output segment
-     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
-     *         CTA_SYNC();
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
-     * The corresponding output for the first segment will be <tt>0, 1, 2, 3, ..., 510, 511</tt>.
-     * The output for the second segment will be <tt>512, 513, 514, 515, ..., 1022, 1023</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <
-        int ITEMS_PER_THREAD,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveSum(
-        T                       (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
-        T                       (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)    ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
-    {
-        ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op);
-    }
-
-
-
-    //@}  end member group        // Exclusive prefix sums
-    /******************************************************************//**
-     * \name Exclusive prefix scan operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix max scan
-     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>INT_MIN, 0, 0, 2, ..., 124, 126</tt>.
-     *
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        T               initial_value,                  ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in <em>thread</em><sub>0</sub>)
-        ScanOp          scan_op)                        ///< [in] Binary scan functor 
-    {
-        InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op);
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix max scan
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>INT_MIN, 0, 0, 2, ..., 124, 126</tt>.
-     * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads.
-     *
-     * \tparam ScanOp   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
-        T               initial_value,      ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in <em>thread</em><sub>0</sub>)
-        ScanOp          scan_op,            ///< [in] Binary scan functor 
-        T               &block_aggregate)   ///< [out] block-wide aggregate reduction of input items
-    {
-        InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an exclusive prefix max scan over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 128 integer items that are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(INT_MIN);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data = d_data[block_offset];
-     *
-     *         // Collectively compute the block-wide exclusive prefix max scan
-     *         BlockScan(temp_storage).ExclusiveScan(
-     *             thread_data, thread_data, INT_MIN, cub::Max(), prefix_op);
-     *         CTA_SYNC();
-     *
-     *         // Store scanned items to output segment
-     *         d_data[block_offset] = thread_data;
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
-     * The corresponding output for the first segment will be <tt>INT_MIN, 0, 0, 2, ..., 124, 126</tt>.
-     * The output for the second segment will be <tt>126, 128, 128, 130, ..., 252, 254</tt>.
-     *
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan functor 
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
-    {
-        InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op);
-    }
-
-
-    //@}  end member group        // Inclusive prefix sums
-    /******************************************************************//**
-     * \name Exclusive prefix scan operations (multiple data per thread)
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix max scan
-     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.
-     * The corresponding output \p thread_data in those threads will be
-     * <tt>{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                 (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
-        T                 (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
-        T                 initial_value,                ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in <em>thread</em><sub>0</sub>)
-        ScanOp            scan_op)                      ///< [in] Binary scan functor
-    {
-        // Reduce consecutive thread items in registers
-        T thread_prefix = internal::ThreadReduce(input, scan_op);
-
-        // Exclusive thread block-scan
-        ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op);
-
-        // Exclusive scan in registers with prefix as seed
-        internal::ThreadScanExclusive(input, output, scan_op, thread_prefix);
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix max scan
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }</tt>.
-     * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                 (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
-        T                 (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
-        T                 initial_value,                ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in <em>thread</em><sub>0</sub>)
-        ScanOp            scan_op,                      ///< [in] Binary scan functor
-        T                 &block_aggregate)             ///< [out] block-wide aggregate reduction of input items
-    {
-        // Reduce consecutive thread items in registers
-        T thread_prefix = internal::ThreadReduce(input, scan_op);
-
-        // Exclusive thread block-scan
-        ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate);
-
-        // Exclusive scan in registers with prefix as seed
-        internal::ThreadScanExclusive(input, output, scan_op, thread_prefix);
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an exclusive prefix max scan over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 128 integer items that are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
-     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
-     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE>  BlockStore;
-     *     typedef cub::BlockScan<int, 128>                             BlockScan;
-     *
-     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
-     *     __shared__ union {
-     *         typename BlockLoad::TempStorage     load;
-     *         typename BlockScan::TempStorage     scan;
-     *         typename BlockStore::TempStorage    store;
-     *     } temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(0);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data[4];
-     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
-     *         CTA_SYNC();
-     *
-     *         // Collectively compute the block-wide exclusive prefix max scan
-     *         BlockScan(temp_storage.scan).ExclusiveScan(
-     *             thread_data, thread_data, INT_MIN, cub::Max(), prefix_op);
-     *         CTA_SYNC();
-     *
-     *         // Store scanned items to output segment
-     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
-     *         CTA_SYNC();
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
-     * The corresponding output for the first segment will be <tt>INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510</tt>.
-     * The output for the second segment will be <tt>510, 512, 512, 514, 514, 516, ..., 1020, 1022</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD         <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp                   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     * \tparam BlockPrefixCallbackOp    <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp,
-        typename        BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                       (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T                       (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan functor
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
-    {
-        // Reduce consecutive thread items in registers
-        T thread_prefix = internal::ThreadReduce(input, scan_op);
-
-        // Exclusive thread block-scan
-        ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op);
-
-        // Exclusive scan in registers with prefix as seed
-        internal::ThreadScanExclusive(input, output, scan_op, thread_prefix);
-    }
-
-
-    //@}  end member group
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document no-initial-value scans
-
-    /******************************************************************//**
-     * \name Exclusive prefix scan operations (no initial value, single datum per thread)
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan functor
-    {
-        InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op);
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \tparam ScanOp   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan functor
-        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
-    {
-        InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate);
-    }
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Exclusive prefix scan operations (no initial value, multiple data per thread)
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                 (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
-        T                 (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
-        ScanOp            scan_op)                      ///< [in] Binary scan functor
-    {
-        // Reduce consecutive thread items in registers
-        T thread_partial = internal::ThreadReduce(input, scan_op);
-
-        // Exclusive thread block-scan
-        ExclusiveScan(thread_partial, thread_partial, scan_op);
-
-        // Exclusive scan in registers with prefix
-        internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan functor
-        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
-    {
-        // Reduce consecutive thread items in registers
-        T thread_partial = internal::ThreadReduce(input, scan_op);
-
-        // Exclusive thread block-scan
-        ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate);
-
-        // Exclusive scan in registers with prefix
-        internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
-    }
-
-
-    //@}  end member group
-#endif // DOXYGEN_SHOULD_SKIP_THIS  // Do not document no-initial-value scans
-
-    /******************************************************************//**
-     * \name Inclusive prefix sum operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.
-     *
-     * \par
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix sum of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix sum
-     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>1, 2, ..., 128</tt>.
-     *
-     */
-    __device__ __forceinline__ void InclusiveSum(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output)                        ///< [out] Calling thread's output item (may be aliased to \p input)
-    {
-        InclusiveScan(input, output, cub::Sum());
-    }
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix sum of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix sum
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>1, 2, ..., 128</tt>.
-     * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads.
-     *
-     */
-    __device__ __forceinline__ void InclusiveSum(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
-    {
-        InclusiveScan(input, output, cub::Sum(), block_aggregate);
-    }
-
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an inclusive prefix sum over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 128 integer items that are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total += block_aggregate;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(0);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data = d_data[block_offset];
-     *
-     *         // Collectively compute the block-wide inclusive prefix sum
-     *         BlockScan(temp_storage).InclusiveSum(
-     *             thread_data, thread_data, prefix_op);
-     *         CTA_SYNC();
-     *
-     *         // Store scanned items to output segment
-     *         d_data[block_offset] = thread_data;
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
-     * The corresponding output for the first segment will be <tt>1, 2, ..., 128</tt>.
-     * The output for the second segment will be <tt>129, 130, ..., 256</tt>.
-     *
-     * \tparam BlockPrefixCallbackOp          <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveSum(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
-    {
-        InclusiveScan(input, output, cub::Sum(), block_prefix_callback_op);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Inclusive prefix sum operations (multiple data per thread)
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.
-     *
-     * \par
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix sum of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix sum
-     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void InclusiveSum(
-        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T               (&output)[ITEMS_PER_THREAD])    ///< [out] Calling thread's output items (may be aliased to \p input)
-    {
-        if (ITEMS_PER_THREAD == 1)
-        {
-            InclusiveSum(input[0], output[0]);
-        }
-        else
-        {
-            // Reduce consecutive thread items in registers
-            Sum scan_op;
-            T thread_prefix = internal::ThreadReduce(input, scan_op);
-
-            // Exclusive thread block-scan
-            ExclusiveSum(thread_prefix, thread_prefix);
-
-            // Inclusive scan in registers with prefix as seed
-            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
-        }
-    }
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix sum of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix sum
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
-     * corresponding output \p thread_data in those threads will be
-     * <tt>{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }</tt>.
-     * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void InclusiveSum(
-        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
-    {
-        if (ITEMS_PER_THREAD == 1)
-        {
-            InclusiveSum(input[0], output[0], block_aggregate);
-        }
-        else
-        {
-            // Reduce consecutive thread items in registers
-            Sum scan_op;
-            T thread_prefix = internal::ThreadReduce(input, scan_op);
-
-            // Exclusive thread block-scan
-            ExclusiveSum(thread_prefix, thread_prefix, block_aggregate);
-
-            // Inclusive scan in registers with prefix as seed
-            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
-        }
-    }
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an inclusive prefix sum over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 512 integer items that are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3)
-     * across 128 threads where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total += block_aggregate;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
-     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
-     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE>  BlockStore;
-     *     typedef cub::BlockScan<int, 128>                             BlockScan;
-     *
-     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
-     *     __shared__ union {
-     *         typename BlockLoad::TempStorage     load;
-     *         typename BlockScan::TempStorage     scan;
-     *         typename BlockStore::TempStorage    store;
-     *     } temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(0);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data[4];
-     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
-     *         CTA_SYNC();
-     *
-     *         // Collectively compute the block-wide inclusive prefix sum
-     *         BlockScan(temp_storage.scan).IncluisveSum(
-     *             thread_data, thread_data, prefix_op);
-     *         CTA_SYNC();
-     *
-     *         // Store scanned items to output segment
-     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
-     *         CTA_SYNC();
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
-     * The corresponding output for the first segment will be <tt>1, 2, 3, 4, ..., 511, 512</tt>.
-     * The output for the second segment will be <tt>513, 514, 515, 516, ..., 1023, 1024</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <
-        int ITEMS_PER_THREAD,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveSum(
-        T                       (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T                       (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
-    {
-        if (ITEMS_PER_THREAD == 1)
-        {
-            InclusiveSum(input[0], output[0], block_prefix_callback_op);
-        }
-        else
-        {
-            // Reduce consecutive thread items in registers
-            Sum scan_op;
-            T thread_prefix = internal::ThreadReduce(input, scan_op);
-
-            // Exclusive thread block-scan
-            ExclusiveSum(thread_prefix, thread_prefix, block_prefix_callback_op);
-
-            // Inclusive scan in registers with prefix as seed
-            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix);
-        }
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Inclusive prefix scan operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix max scan
-     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>0, 0, 2, 2, ..., 126, 126</tt>.
-     *
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan functor 
-    {
-        InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op);
-    }
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix max scan
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>0, 0, 2, 2, ..., 126, 126</tt>.
-     * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads.
-     *
-     * \tparam ScanOp   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan functor 
-        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
-    {
-        InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate);
-    }
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an inclusive prefix max scan over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 128 integer items that are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(INT_MIN);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data = d_data[block_offset];
-     *
-     *         // Collectively compute the block-wide inclusive prefix max scan
-     *         BlockScan(temp_storage).InclusiveScan(
-     *             thread_data, thread_data, cub::Max(), prefix_op);
-     *         CTA_SYNC();
-     *
-     *         // Store scanned items to output segment
-     *         d_data[block_offset] = thread_data;
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
-     * The corresponding output for the first segment will be <tt>0, 0, 2, 2, ..., 126, 126</tt>.
-     * The output for the second segment will be <tt>128, 128, 130, 130, ..., 254, 254</tt>.
-     *
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan functor 
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
-    {
-        InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Inclusive prefix scan operations (multiple data per thread)
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix max scan
-     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan functor 
-    {
-        if (ITEMS_PER_THREAD == 1)
-        {
-            InclusiveScan(input[0], output[0], scan_op);
-        }
-        else
-        {
-            // Reduce consecutive thread items in registers
-            T thread_prefix = internal::ThreadReduce(input, scan_op);
-
-            // Exclusive thread block-scan
-            ExclusiveScan(thread_prefix, thread_prefix, scan_op);
-
-            // Inclusive scan in registers with prefix as seed (first thread does not seed)
-            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
-        }
-    }
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix max scan
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.
-     * The corresponding output \p thread_data in those threads will be
-     * <tt>{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }</tt>.
-     * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename         ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan functor 
-        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
-    {
-        if (ITEMS_PER_THREAD == 1)
-        {
-            InclusiveScan(input[0], output[0], scan_op, block_aggregate);
-        }
-        else
-        {
-            // Reduce consecutive thread items in registers
-            T thread_prefix = internal::ThreadReduce(input, scan_op);
-
-            // Exclusive thread block-scan (with no initial value)
-            ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_aggregate);
-
-            // Inclusive scan in registers with prefix as seed (first thread does not seed)
-            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
-        }
-    }
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an inclusive prefix max scan over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 128 integer items that are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
-     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
-     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE>  BlockStore;
-     *     typedef cub::BlockScan<int, 128>                             BlockScan;
-     *
-     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
-     *     __shared__ union {
-     *         typename BlockLoad::TempStorage     load;
-     *         typename BlockScan::TempStorage     scan;
-     *         typename BlockStore::TempStorage    store;
-     *     } temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(0);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data[4];
-     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
-     *         CTA_SYNC();
-     *
-     *         // Collectively compute the block-wide inclusive prefix max scan
-     *         BlockScan(temp_storage.scan).InclusiveScan(
-     *             thread_data, thread_data, cub::Max(), prefix_op);
-     *         CTA_SYNC();
-     *
-     *         // Store scanned items to output segment
-     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
-     *         CTA_SYNC();
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
-     * The corresponding output for the first segment will be <tt>0, 0, 2, 2, 4, 4, ..., 510, 510</tt>.
-     * The output for the second segment will be <tt>512, 512, 514, 514, 516, 516, ..., 1022, 1022</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD         <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp                   <b>[inferred]</b> Binary scan functor  type having member <tt>T operator()(const T &a, const T &b)</tt>
-     * \tparam BlockPrefixCallbackOp    <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp,
-        typename        BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T                       (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T                       (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan functor 
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
-    {
-        if (ITEMS_PER_THREAD == 1)
-        {
-            InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op);
-        }
-        else
-        {
-            // Reduce consecutive thread items in registers
-            T thread_prefix = internal::ThreadReduce(input, scan_op);
-
-            // Exclusive thread block-scan
-            ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op);
-
-            // Inclusive scan in registers with prefix as seed
-            internal::ThreadScanInclusive(input, output, scan_op, thread_prefix);
-        }
-    }
-
-    //@}  end member group
-
-
-};
-
-/**
- * \example example_block_scan.cu
- */
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/block_shuffle.cuh b/thrust/system/cuda/detail/cub/block/block_shuffle.cuh
deleted file mode 100644
index eb49fb6d4..000000000
--- a/thrust/system/cuda/detail/cub/block/block_shuffle.cuh
+++ /dev/null
@@ -1,305 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockShuffle class provides [<em>collective</em>](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../util_arch.cuh"
-#include "../util_ptx.cuh"
-#include "../util_macro.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief The BlockShuffle class provides [<em>collective</em>](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block.
- * \ingroup BlockModule
- *
- * \tparam T                    The data type to be exchanged.
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * It is commonplace for blocks of threads to rearrange data items between
- * threads.  The BlockShuffle abstraction allows threads to efficiently shift items
- * either (a) up to their successor or (b) down to their predecessor.
- *
- */
-template <
-    typename            T,
-    int                 BLOCK_DIM_X,
-    int                 BLOCK_DIM_Y         = 1,
-    int                 BLOCK_DIM_Z         = 1,
-    int                 PTX_ARCH            = CUB_PTX_ARCH>
-class BlockShuffle
-{
-private:
-
-    /******************************************************************************
-     * Constants
-     ******************************************************************************/
-
-    enum
-    {
-        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
-        WARP_THREADS                = 1 << LOG_WARP_THREADS,
-        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-    };
-
-    /******************************************************************************
-     * Type definitions
-     ******************************************************************************/
-
-    /// Shared memory storage layout type (last element from each thread's input)
-    struct _TempStorage
-    {
-        T prev[BLOCK_THREADS];
-        T next[BLOCK_THREADS];
-    };
-
-
-public:
-
-    /// \smemstorage{BlockShuffle}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-private:
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    unsigned int linear_tid;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-public:
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockShuffle()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockShuffle(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Shuffle movement
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Each <em>thread<sub>i</sub></em> obtains the \p input provided by <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub>. The offset \p distance may be negative.
-     *
-     * \par
-     * - \smemreuse
-     */
-    __device__ __forceinline__ void Offset(
-        T   input,                  ///< [in] The input item from the calling thread (<em>thread<sub>i</sub></em>)
-        T&  output,                 ///< [out] The \p input item from the successor (or predecessor) thread <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub> (may be aliased to \p input).  This value is only updated for for <em>thread<sub>i</sub></em> when 0 <= (<em>i</em> + \p distance) < <tt>BLOCK_THREADS-1</tt>
-        int distance = 1)           ///< [in] Offset distance (may be negative)
-    {
-        temp_storage[linear_tid].prev = input;
-
-        CTA_SYNC();
-
-        if ((linear_tid + distance >= 0) && (linear_tid + distance < BLOCK_THREADS))
-            output = temp_storage[linear_tid + distance].prev;
-    }
-
-
-    /**
-     * \brief Each <em>thread<sub>i</sub></em> obtains the \p input provided by <em>thread</em><sub><em>i</em>+<tt>distance</tt></sub>.
-     *
-     * \par
-     * - \smemreuse
-     */
-    __device__ __forceinline__ void Rotate(
-        T   input,                  ///< [in] The calling thread's input item
-        T&  output,                 ///< [out] The \p input item from thread <em>thread</em><sub>(<em>i</em>+<tt>distance></tt>)%<tt><BLOCK_THREADS></tt></sub> (may be aliased to \p input).  This value is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>
-        unsigned int distance = 1)  ///< [in] Offset distance (0 < \p distance < <tt>BLOCK_THREADS</tt>)
-    {
-        temp_storage[linear_tid].prev = input;
-
-        CTA_SYNC();
-
-        unsigned int offset = threadIdx.x + distance;
-        if (offset >= BLOCK_THREADS)
-            offset -= BLOCK_THREADS;
-
-        output = temp_storage[offset].prev;
-    }
-
-
-    /**
-     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it up by one item
-     *
-     * \par
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void Up(
-        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
-        T (&prev)[ITEMS_PER_THREAD])    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
-    {
-        temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        #pragma unroll
-        for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
-            prev[ITEM] = input[ITEM - 1];
-
-
-        if (linear_tid > 0)
-            prev[0] = temp_storage[linear_tid - 1].prev;
-    }
-
-
-    /**
-     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it up by one item.  All threads receive the \p input provided by <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>.
-     *
-     * \par
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void Up(
-        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
-        T (&prev)[ITEMS_PER_THREAD],    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
-        T &block_suffix)                ///< [out] The item \p input[ITEMS_PER_THREAD-1] from <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>, provided to all threads
-    {
-        Up(input, prev);
-        block_suffix = temp_storage[BLOCK_THREADS - 1].prev;
-    }
-
-
-    /**
-     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of \p input items, shifting it down by one item
-     *
-     * \par
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void Down(
-        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
-        T (&prev)[ITEMS_PER_THREAD])    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The value \p prev[0] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
-    {
-        temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        #pragma unroll
-        for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
-            prev[ITEM] = input[ITEM - 1];
-
-        if (linear_tid > 0)
-            prev[0] = temp_storage[linear_tid - 1].prev;
-    }
-
-
-    /**
-     * \brief The thread block rotates its [<em>blocked arrangement</em>](index.html#sec5sec3) of input items, shifting it down by one item.  All threads receive \p input[0] provided by <em>thread</em><sub><tt>0</tt></sub>.
-     *
-     * \par
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void Down(
-        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
-        T (&prev)[ITEMS_PER_THREAD],    ///< [out] The corresponding predecessor items (may be aliased to \p input).  The value \p prev[0] is not updated for <em>thread</em><sub>BLOCK_THREADS-1</sub>.
-        T &block_prefix)                ///< [out] The item \p input[0] from <em>thread</em><sub><tt>0</tt></sub>, provided to all threads
-    {
-        Up(input, prev);
-        block_prefix = temp_storage[BLOCK_THREADS - 1].prev;
-    }
-
-    //@}  end member group
-
-
-};
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/block_store.cuh b/thrust/system/cuda/detail/cub/block/block_store.cuh
deleted file mode 100644
index c79c94f5b..000000000
--- a/thrust/system/cuda/detail/cub/block/block_store.cuh
+++ /dev/null
@@ -1,1000 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Operations for writing linear segments of data from the CUDA thread block
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "block_exchange.cuh"
-#include "../util_ptx.cuh"
-#include "../util_macro.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIo
- * @{
- */
-
-
-/******************************************************************//**
- * \name Blocked arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-/**
- * \brief Store a blocked arrangement of items across a thread block into a linear segment of items.
- *
- * \blocked
- *
- * \tparam T                    <b>[inferred]</b> The data type to store.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
- */
-template <
-    typename            T,
-    int                 ITEMS_PER_THREAD,
-    typename            OutputIteratorT>
-__device__ __forceinline__ void StoreDirectBlocked(
-    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-{
-    OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);
-
-    // Store directly in thread-blocked order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        thread_itr[ITEM] = items[ITEM];
-    }
-}
-
-
-/**
- * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range
- *
- * \blocked
- *
- * \tparam T                    <b>[inferred]</b> The data type to store.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
- */
-template <
-    typename            T,
-    int                 ITEMS_PER_THREAD,
-    typename            OutputIteratorT>
-__device__ __forceinline__ void StoreDirectBlocked(
-    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-    int                 valid_items)                ///< [in] Number of valid items to write
-{
-    OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);
-
-    // Store directly in thread-blocked order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items)
-        {
-            thread_itr[ITEM] = items[ITEM];
-        }
-    }
-}
-
-
-/**
- * \brief Store a blocked arrangement of items across a thread block into a linear segment of items.
- *
- * \blocked
- *
- * The output offset (\p block_ptr + \p block_offset) must be quad-item aligned,
- * which is the default starting offset returned by \p cudaMalloc()
- *
- * \par
- * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT:
- *   - \p ITEMS_PER_THREAD is odd
- *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
- *
- * \tparam T                    <b>[inferred]</b> The data type to store.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- *
- */
-template <
-    typename            T,
-    int                 ITEMS_PER_THREAD>
-__device__ __forceinline__ void StoreDirectBlockedVectorized(
-    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    T                   *block_ptr,                 ///< [in] Input pointer for storing from
-    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-{
-    enum
-    {
-        // Maximum CUDA vector size is 4 elements
-        MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),
-
-        // Vector size must be a power of two and an even divisor of the items per thread
-        VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
-            MAX_VEC_SIZE :
-            1,
-
-        VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
-    };
-
-    // Vector type
-    typedef typename CubVector<T, VEC_SIZE>::Type Vector;
-
-    // Alias global pointer
-    Vector *block_ptr_vectors = reinterpret_cast<Vector*>(const_cast<T*>(block_ptr));
-
-    // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
-    Vector raw_vector[VECTORS_PER_THREAD];
-    T *raw_items = reinterpret_cast<T*>(raw_vector);
-
-    // Copy
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        raw_items[ITEM] = items[ITEM];
-    }
-
-    // Direct-store using vector types
-    StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector);
-}
-
-
-
-//@}  end member group
-/******************************************************************//**
- * \name Striped arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-
-/**
- * \brief Store a striped arrangement of data across the thread block into a linear segment of items.
- *
- * \striped
- *
- * \tparam BLOCK_THREADS        The thread block size in threads
- * \tparam T                    <b>[inferred]</b> The data type to store.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
- */
-template <
-    int                 BLOCK_THREADS,
-    typename            T,
-    int                 ITEMS_PER_THREAD,
-    typename            OutputIteratorT>
-__device__ __forceinline__ void StoreDirectStriped(
-    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-{
-    OutputIteratorT thread_itr = block_itr + linear_tid;
-
-    // Store directly in striped order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
-    }
-}
-
-
-/**
- * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range
- *
- * \striped
- *
- * \tparam BLOCK_THREADS        The thread block size in threads
- * \tparam T                    <b>[inferred]</b> The data type to store.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
- */
-template <
-    int                 BLOCK_THREADS,
-    typename            T,
-    int                 ITEMS_PER_THREAD,
-    typename            OutputIteratorT>
-__device__ __forceinline__ void StoreDirectStriped(
-    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-    int                 valid_items)                ///< [in] Number of valid items to write
-{
-    OutputIteratorT thread_itr = block_itr + linear_tid;
-
-    // Store directly in striped order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items)
-        {
-            thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
-        }
-    }
-}
-
-
-
-//@}  end member group
-/******************************************************************//**
- * \name Warp-striped arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-
-/**
- * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items.
- *
- * \warpstriped
- *
- * \par Usage Considerations
- * The number of threads in the thread block must be a multiple of the architecture's warp size.
- *
- * \tparam T                    <b>[inferred]</b> The data type to store.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
- */
-template <
-    typename            T,
-    int                 ITEMS_PER_THREAD,
-    typename            OutputIteratorT>
-__device__ __forceinline__ void StoreDirectWarpStriped(
-    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-    T                   (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
-{
-    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
-    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
-    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
-
-    OutputIteratorT thread_itr = block_itr + warp_offset + tid;
-
-    // Store directly in warp-striped order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
-    }
-}
-
-
-/**
- * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range
- *
- * \warpstriped
- *
- * \par Usage Considerations
- * The number of threads in the thread block must be a multiple of the architecture's warp size.
- *
- * \tparam T                    <b>[inferred]</b> The data type to store.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam OutputIteratorT      <b>[inferred]</b> The random-access iterator type for output \iterator.
- */
-template <
-    typename            T,
-    int                 ITEMS_PER_THREAD,
-    typename            OutputIteratorT>
-__device__ __forceinline__ void StoreDirectWarpStriped(
-    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-    int                 valid_items)                ///< [in] Number of valid items to write
-{
-    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
-    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
-    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
-
-    OutputIteratorT thread_itr = block_itr + warp_offset + tid;
-
-    // Store directly in warp-striped order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
-        {
-            thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
-        }
-    }
-}
-
-
-//@}  end member group
-
-
-/** @} */       // end group UtilIo
-
-
-//-----------------------------------------------------------------------------
-// Generic BlockStore abstraction
-//-----------------------------------------------------------------------------
-
-/**
- * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory.
- */
-enum BlockStoreAlgorithm
-{
-    /**
-     * \par Overview
-     *
-     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written
-     * directly to memory.
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) decreases as the
-     *   access stride between threads increases (i.e., the number items per thread).
-     */
-    BLOCK_STORE_DIRECT,
-
-    /**
-     * \par Overview
-     *
-     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written directly
-     * to memory using CUDA's built-in vectorized stores as a coalescing optimization.
-     * For example, <tt>st.global.v4.s32</tt> instructions will be generated
-     * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high until the the
-     *   access stride between threads (i.e., the number items per thread) exceeds the
-     *   maximum vector store width (typically 4 items or 64B, whichever is lower).
-     * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT:
-     *   - \p ITEMS_PER_THREAD is odd
-     *   - The \p OutputIteratorT is not a simple pointer type
-     *   - The block output offset is not quadword-aligned
-     *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
-     */
-    BLOCK_STORE_VECTORIZE,
-
-    /**
-     * \par Overview
-     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
-     * transposed and then efficiently written to memory as a [<em>striped arrangement</em>](index.html#sec5sec3).
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high regardless
-     *   of items written per thread.
-     * - The local reordering incurs slightly longer latencies and throughput than the
-     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
-     */
-    BLOCK_STORE_TRANSPOSE,
-
-    /**
-     * \par Overview
-     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
-     * transposed and then efficiently written to memory as a
-     * [<em>warp-striped arrangement</em>](index.html#sec5sec3)
-     *
-     * \par Usage Considerations
-     * - BLOCK_THREADS must be a multiple of WARP_THREADS
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high regardless
-     *   of items written per thread.
-     * - The local reordering incurs slightly longer latencies and throughput than the
-     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
-     */
-    BLOCK_STORE_WARP_TRANSPOSE,
-
-    /**
-     * \par Overview
-     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
-     * transposed and then efficiently written to memory as a
-     * [<em>warp-striped arrangement</em>](index.html#sec5sec3)
-     * To reduce the shared memory requirement, only one warp's worth of shared
-     * memory is provisioned and is subsequently time-sliced among warps.
-     *
-     * \par Usage Considerations
-     * - BLOCK_THREADS must be a multiple of WARP_THREADS
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high regardless
-     *   of items written per thread.
-     * - Provisions less shared memory temporary storage, but incurs larger
-     *   latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative.
-     */
-    BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
-
-};
-
-
-/**
- * \brief The BlockStore class provides [<em>collective</em>](index.html#sec0) data movement methods for writing a [<em>blocked arrangement</em>](index.html#sec5sec3) of items partitioned across a CUDA thread block to a linear segment of memory.  ![](block_store_logo.png)
- * \ingroup BlockModule
- * \ingroup UtilIo
- *
- * \tparam T                    The type of data to be written.
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam ITEMS_PER_THREAD     The number of consecutive items partitioned onto each thread.
- * \tparam ALGORITHM            <b>[optional]</b> cub::BlockStoreAlgorithm tuning policy enumeration.  default: cub::BLOCK_STORE_DIRECT.
- * \tparam WARP_TIME_SLICING    <b>[optional]</b> Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false)
- * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - The BlockStore class provides a single data movement abstraction that can be specialized
- *   to implement different cub::BlockStoreAlgorithm strategies.  This facilitates different
- *   performance policies for different architectures, data types, granularity sizes, etc.
- * - BlockStore can be optionally specialized by different data movement strategies:
- *   -# <b>cub::BLOCK_STORE_DIRECT</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written
- *      directly to memory. [More...](\ref cub::BlockStoreAlgorithm)
- *   -# <b>cub::BLOCK_STORE_VECTORIZE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
- *      of data is written directly to memory using CUDA's built-in vectorized stores as a
- *      coalescing optimization.  [More...](\ref cub::BlockStoreAlgorithm)
- *   -# <b>cub::BLOCK_STORE_TRANSPOSE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
- *      is locally transposed into a [<em>striped arrangement</em>](index.html#sec5sec3) which is
- *      then written to memory.  [More...](\ref cub::BlockStoreAlgorithm)
- *   -# <b>cub::BLOCK_STORE_WARP_TRANSPOSE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
- *      is locally transposed into a [<em>warp-striped arrangement</em>](index.html#sec5sec3) which is
- *      then written to memory.  [More...](\ref cub::BlockStoreAlgorithm)
- * - \rowmajor
- *
- * \par A Simple Example
- * \blockcollective{BlockStore}
- * \par
- * The code snippet below illustrates the storing of a "blocked" arrangement
- * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
- * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
- * meaning items are locally reordered among threads so that memory references will be
- * efficiently coalesced using a warp-striped access pattern.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
- *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
- *
- *     // Allocate shared memory for BlockStore
- *     __shared__ typename BlockStore::TempStorage temp_storage;
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     ...
- *
- *     // Store items to linear memory
- *     int thread_data[4];
- *     BlockStore(temp_storage).Store(d_data, thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of \p thread_data across the block of threads is
- * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
- * The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
- *
- */
-template <
-    typename                T,
-    int                     BLOCK_DIM_X,
-    int                     ITEMS_PER_THREAD,
-    BlockStoreAlgorithm     ALGORITHM           = BLOCK_STORE_DIRECT,
-    int                     BLOCK_DIM_Y         = 1,
-    int                     BLOCK_DIM_Z         = 1,
-    int                     PTX_ARCH            = CUB_PTX_ARCH>
-class BlockStore
-{
-private:
-    /******************************************************************************
-     * Constants and typed definitions
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-
-    /******************************************************************************
-     * Algorithmic variants
-     ******************************************************************************/
-
-    /// Store helper
-    template <BlockStoreAlgorithm _POLICY, int DUMMY>
-    struct StoreInternal;
-
-
-    /**
-     * BLOCK_STORE_DIRECT specialization of store helper
-     */
-    template <int DUMMY>
-    struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY>
-    {
-        /// Shared memory storage layout type
-        typedef NullType TempStorage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ StoreInternal(
-            TempStorage &/*temp_storage*/,
-            int linear_tid)
-        :
-            linear_tid(linear_tid)
-        {}
-
-        /// Store items into a linear segment of memory
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-        {
-            StoreDirectBlocked(linear_tid, block_itr, items);
-        }
-
-        /// Store items into a linear segment of memory, guarded by range
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-            int                 valid_items)                ///< [in] Number of valid items to write
-        {
-            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
-        }
-    };
-
-
-    /**
-     * BLOCK_STORE_VECTORIZE specialization of store helper
-     */
-    template <int DUMMY>
-    struct StoreInternal<BLOCK_STORE_VECTORIZE, DUMMY>
-    {
-        /// Shared memory storage layout type
-        typedef NullType TempStorage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ StoreInternal(
-            TempStorage &/*temp_storage*/,
-            int linear_tid)
-        :
-            linear_tid(linear_tid)
-        {}
-
-        /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization)
-        __device__ __forceinline__ void Store(
-            T                   *block_ptr,                 ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-        {
-            StoreDirectBlockedVectorized(linear_tid, block_ptr, items);
-        }
-
-        /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization)
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT    block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-        {
-            StoreDirectBlocked(linear_tid, block_itr, items);
-        }
-
-        /// Store items into a linear segment of memory, guarded by range
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-            int                 valid_items)                ///< [in] Number of valid items to write
-        {
-            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
-        }
-    };
-
-
-    /**
-     * BLOCK_STORE_TRANSPOSE specialization of store helper
-     */
-    template <int DUMMY>
-    struct StoreInternal<BLOCK_STORE_TRANSPOSE, DUMMY>
-    {
-        // BlockExchange utility type for keys
-        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
-
-        /// Shared memory storage layout type
-        struct _TempStorage : BlockExchange::TempStorage
-        {
-            /// Temporary storage for partially-full block guard
-            volatile int valid_items;
-        };
-
-        /// Alias wrapper allowing storage to be unioned
-        struct TempStorage : Uninitialized<_TempStorage> {};
-
-        /// Thread reference to shared storage
-        _TempStorage &temp_storage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ StoreInternal(
-            TempStorage &temp_storage,
-            int linear_tid)
-        :
-            temp_storage(temp_storage.Alias()),
-            linear_tid(linear_tid)
-        {}
-
-        /// Store items into a linear segment of memory
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-        {
-            BlockExchange(temp_storage).BlockedToStriped(items);
-            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
-        }
-
-        /// Store items into a linear segment of memory, guarded by range
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT   block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-            int                 valid_items)                ///< [in] Number of valid items to write
-        {
-            BlockExchange(temp_storage).BlockedToStriped(items);
-            if (linear_tid == 0)
-                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
-            CTA_SYNC();
-            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, temp_storage.valid_items);
-        }
-    };
-
-
-    /**
-     * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper
-     */
-    template <int DUMMY>
-    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE, DUMMY>
-    {
-        enum
-        {
-            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
-        };
-
-        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
-        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
-
-        // BlockExchange utility type for keys
-        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
-
-        /// Shared memory storage layout type
-        struct _TempStorage : BlockExchange::TempStorage
-        {
-            /// Temporary storage for partially-full block guard
-            volatile int valid_items;
-        };
-
-        /// Alias wrapper allowing storage to be unioned
-        struct TempStorage : Uninitialized<_TempStorage> {};
-
-        /// Thread reference to shared storage
-        _TempStorage &temp_storage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ StoreInternal(
-            TempStorage &temp_storage,
-            int linear_tid)
-        :
-            temp_storage(temp_storage.Alias()),
-            linear_tid(linear_tid)
-        {}
-
-        /// Store items into a linear segment of memory
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT   block_itr,                    ///< [in] The thread block's base output iterator for storing to
-            T                 (&items)[ITEMS_PER_THREAD])   ///< [in] Data to store
-        {
-            BlockExchange(temp_storage).BlockedToWarpStriped(items);
-            StoreDirectWarpStriped(linear_tid, block_itr, items);
-        }
-
-        /// Store items into a linear segment of memory, guarded by range
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT   block_itr,                    ///< [in] The thread block's base output iterator for storing to
-            T                 (&items)[ITEMS_PER_THREAD],   ///< [in] Data to store
-            int               valid_items)                  ///< [in] Number of valid items to write
-        {
-            BlockExchange(temp_storage).BlockedToWarpStriped(items);
-            if (linear_tid == 0)
-                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
-            CTA_SYNC();
-            StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
-        }
-    };
-
-
-    /**
-     * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper
-     */
-    template <int DUMMY>
-    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, DUMMY>
-    {
-        enum
-        {
-            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
-        };
-
-        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
-        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
-
-        // BlockExchange utility type for keys
-        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
-
-        /// Shared memory storage layout type
-        struct _TempStorage : BlockExchange::TempStorage
-        {
-            /// Temporary storage for partially-full block guard
-            volatile int valid_items;
-        };
-
-        /// Alias wrapper allowing storage to be unioned
-        struct TempStorage : Uninitialized<_TempStorage> {};
-
-        /// Thread reference to shared storage
-        _TempStorage &temp_storage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ StoreInternal(
-            TempStorage &temp_storage,
-            int linear_tid)
-        :
-            temp_storage(temp_storage.Alias()),
-            linear_tid(linear_tid)
-        {}
-
-        /// Store items into a linear segment of memory
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-        {
-            BlockExchange(temp_storage).BlockedToWarpStriped(items);
-            StoreDirectWarpStriped(linear_tid, block_itr, items);
-        }
-
-        /// Store items into a linear segment of memory, guarded by range
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT   block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-            int                 valid_items)                ///< [in] Number of valid items to write
-        {
-            BlockExchange(temp_storage).BlockedToWarpStriped(items);
-            if (linear_tid == 0)
-                temp_storage.valid_items = valid_items;     // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads
-            CTA_SYNC();
-            StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items);
-        }
-    };
-
-    /******************************************************************************
-     * Type definitions
-     ******************************************************************************/
-
-    /// Internal load implementation to use
-    typedef StoreInternal<ALGORITHM, 0> InternalStore;
-
-
-    /// Shared memory storage layout type
-    typedef typename InternalStore::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Thread reference to shared storage
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    int linear_tid;
-
-public:
-
-
-    /// \smemstorage{BlockStore}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockStore()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockStore(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Data movement
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Store items into a linear segment of memory.
-     *
-     * \par
-     * - \blocked
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the storing of a "blocked" arrangement
-     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
-     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
-     * meaning items are locally reordered among threads so that memory references will be
-     * efficiently coalesced using a warp-striped access pattern.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, ...)
-     * {
-     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
-     *
-     *     // Allocate shared memory for BlockStore
-     *     __shared__ typename BlockStore::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Store items to linear memory
-     *     int thread_data[4];
-     *     BlockStore(temp_storage).Store(d_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of \p thread_data across the block of threads is
-     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
-     * The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
-     *
-     */
-    template <typename OutputIteratorT>
-    __device__ __forceinline__ void Store(
-        OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-        T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-    {
-        InternalStore(temp_storage, linear_tid).Store(block_itr, items);
-    }
-
-    /**
-     * \brief Store items into a linear segment of memory, guarded by range.
-     *
-     * \par
-     * - \blocked
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the guarded storing of a "blocked" arrangement
-     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
-     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
-     * meaning items are locally reordered among threads so that memory references will be
-     * efficiently coalesced using a warp-striped access pattern.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
-     * {
-     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
-     *
-     *     // Allocate shared memory for BlockStore
-     *     __shared__ typename BlockStore::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Store items to linear memory
-     *     int thread_data[4];
-     *     BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of \p thread_data across the block of threads is
-     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt> and \p valid_items is \p 5.
-     * The output \p d_data will be <tt>0, 1, 2, 3, 4, ?, ?, ?, ...</tt>, with
-     * only the first two threads being unmasked to store portions of valid data.
-     *
-     */
-    template <typename OutputIteratorT>
-    __device__ __forceinline__ void Store(
-        OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-        T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-        int                 valid_items)                ///< [in] Number of valid items to write
-    {
-        InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);
-    }
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh
deleted file mode 100644
index c971f000a..000000000
--- a/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh
+++ /dev/null
@@ -1,82 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
- */
-template <int BINS>
-struct BlockHistogramAtomic
-{
-    /// Shared memory storage layout type
-    struct TempStorage {};
-
-
-    /// Constructor
-    __device__ __forceinline__ BlockHistogramAtomic(
-        TempStorage &temp_storage)
-    {}
-
-
-    /// Composite data onto an existing histogram
-    template <
-        typename            T,
-        typename            CounterT,     
-        int                 ITEMS_PER_THREAD>
-    __device__ __forceinline__ void Composite(
-        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
-        CounterT             histogram[BINS])                 ///< [out] Reference to shared/device-accessible memory histogram
-    {
-        // Update histogram
-        #pragma unroll
-        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
-        {
-              atomicAdd(histogram + items[i], 1);
-        }
-    }
-
-};
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh
deleted file mode 100644
index cdbbefd40..000000000
--- a/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh
+++ /dev/null
@@ -1,226 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../../block/block_radix_sort.cuh"
-#include "../../block/block_discontinuity.cuh"
-#include "../../util_ptx.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-
-/**
- * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
- */
-template <
-    typename    T,                  ///< Sample type
-    int         BLOCK_DIM_X,        ///< The thread block length in threads along the X dimension
-    int         ITEMS_PER_THREAD,   ///< The number of samples per thread
-    int         BINS,               ///< The number of bins into which histogram samples may fall
-    int         BLOCK_DIM_Y,        ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,        ///< The thread block length in threads along the Z dimension
-    int         PTX_ARCH>           ///< The PTX compute capability for which to to specialize this collective
-struct BlockHistogramSort
-{
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    // Parameterize BlockRadixSort type for our thread block
-    typedef BlockRadixSort<
-            T,
-            BLOCK_DIM_X,
-            ITEMS_PER_THREAD,
-            NullType,
-            4,
-            (PTX_ARCH >= 350) ? true : false,
-            BLOCK_SCAN_WARP_SCANS,
-            cudaSharedMemBankSizeFourByte,
-            BLOCK_DIM_Y,
-            BLOCK_DIM_Z,
-            PTX_ARCH>
-        BlockRadixSortT;
-
-    // Parameterize BlockDiscontinuity type for our thread block
-    typedef BlockDiscontinuity<
-            T,
-            BLOCK_DIM_X,
-            BLOCK_DIM_Y,
-            BLOCK_DIM_Z,
-            PTX_ARCH>
-        BlockDiscontinuityT;
-
-    /// Shared memory
-    union _TempStorage
-    {
-        // Storage for sorting bin values
-        typename BlockRadixSortT::TempStorage sort;
-
-        struct
-        {
-            // Storage for detecting discontinuities in the tile of sorted bin values
-            typename BlockDiscontinuityT::TempStorage flag;
-
-            // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values
-            unsigned int run_begin[BINS];
-            unsigned int run_end[BINS];
-        };
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    // Thread fields
-    _TempStorage &temp_storage;
-    unsigned int linear_tid;
-
-
-    /// Constructor
-    __device__ __forceinline__ BlockHistogramSort(
-        TempStorage     &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    // Discontinuity functor
-    struct DiscontinuityOp
-    {
-        // Reference to temp_storage
-        _TempStorage &temp_storage;
-
-        // Constructor
-        __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) :
-            temp_storage(temp_storage)
-        {}
-
-        // Discontinuity predicate
-        __device__ __forceinline__ bool operator()(const T &a, const T &b, int b_index)
-        {
-            if (a != b)
-            {
-                // Note the begin/end offsets in shared storage
-                temp_storage.run_begin[b] = b_index;
-                temp_storage.run_end[a] = b_index;
-
-                return true;
-            }
-            else
-            {
-                return false;
-            }
-        }
-    };
-
-
-    // Composite data onto an existing histogram
-    template <
-        typename            CounterT     >
-    __device__ __forceinline__ void Composite(
-        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
-        CounterT            histogram[BINS])                 ///< [out] Reference to shared/device-accessible memory histogram
-    {
-        enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };
-
-        // Sort bytes in blocked arrangement
-        BlockRadixSortT(temp_storage.sort).Sort(items);
-
-        CTA_SYNC();
-
-        // Initialize the shared memory's run_begin and run_end for each bin
-        int histo_offset = 0;
-
-        #pragma unroll
-        for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
-        {
-            temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
-            temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
-        }
-        // Finish up with guarded initialization if necessary
-        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
-        {
-            temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
-            temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
-        }
-
-        CTA_SYNC();
-
-        int flags[ITEMS_PER_THREAD];    // unused
-
-        // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile
-        DiscontinuityOp flag_op(temp_storage);
-        BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op);
-
-        // Update begin for first item
-        if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0;
-
-        CTA_SYNC();
-
-        // Composite into histogram
-        histo_offset = 0;
-
-        #pragma unroll
-        for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
-        {
-            int thread_offset = histo_offset + linear_tid;
-            CounterT      count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
-            histogram[thread_offset] += count;
-        }
-
-        // Finish up with guarded composition if necessary
-        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
-        {
-            int thread_offset = histo_offset + linear_tid;
-            CounterT      count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
-            histogram[thread_offset] += count;
-        }
-    }
-
-};
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh
deleted file mode 100644
index 612a5acf7..000000000
--- a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh
+++ /dev/null
@@ -1,222 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
- */
-
-#pragma once
-
-#include "../../block/block_raking_layout.cuh"
-#include "../../warp/warp_reduce.cuh"
-#include "../../thread/thread_reduce.cuh"
-#include "../../util_ptx.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
- *
- * Supports non-commutative binary reduction operators.  Unlike commutative
- * reduction operators (e.g., addition), the application of a non-commutative
- * reduction operator (e.g, string concatenation) across a sequence of inputs must
- * honor the relative ordering of items and partial reductions when applying the
- * reduction operator.
- *
- * Compared to the implementation of BlockReduceRaking (which does not support
- * non-commutative operators), this implementation requires a few extra
- * rounds of inter-thread communication.
- */
-template <
-    typename    T,              ///< Data type being reduced
-    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
-    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
-    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
-struct BlockReduceRaking
-{
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    /// Layout type for padded thread block raking grid
-    typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;
-
-    ///  WarpReduce utility type
-    typedef typename WarpReduce<T, BlockRakingLayout::RAKING_THREADS, PTX_ARCH>::InternalWarpReduce WarpReduce;
-
-    /// Constants
-    enum
-    {
-        /// Number of raking threads
-        RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
-
-        /// Number of raking elements per warp synchronous raking thread
-        SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
-
-        /// Cooperative work can be entirely warp synchronous
-        WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS),
-
-        /// Whether or not warp-synchronous reduction should be unguarded (i.e., the warp-reduction elements is a power of two
-        WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo<RAKING_THREADS>::VALUE,
-
-        /// Whether or not accesses into smem are unguarded
-        RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED,
-
-    };
-
-
-    /// Shared memory storage layout type
-    union _TempStorage
-    {
-        typename WarpReduce::TempStorage            warp_storage;        ///< Storage for warp-synchronous reduction
-        typename BlockRakingLayout::TempStorage     raking_grid;         ///< Padded thread block raking grid
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    // Thread fields
-    _TempStorage &temp_storage;
-    unsigned int linear_tid;
-
-
-    /// Constructor
-    __device__ __forceinline__ BlockReduceRaking(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    template <bool IS_FULL_TILE, typename ReductionOp, int ITERATION>
-    __device__ __forceinline__ T RakingReduction(
-        ReductionOp                 reduction_op,       ///< [in] Binary scan operator
-        T                           *raking_segment,
-        T                           partial,            ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
-        int                         num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-        Int2Type<ITERATION>         /*iteration*/)
-    {
-        // Update partial if addend is in range
-        if ((IS_FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid))
-        {
-            T addend = raking_segment[ITERATION];
-            partial = reduction_op(partial, addend);
-        }
-        return RakingReduction<IS_FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<ITERATION + 1>());
-    }
-
-    template <bool IS_FULL_TILE, typename ReductionOp>
-    __device__ __forceinline__ T RakingReduction(
-        ReductionOp                 /*reduction_op*/,   ///< [in] Binary scan operator
-        T                           * /*raking_segment*/,
-        T                           partial,            ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
-        int                         /*num_valid*/,      ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-        Int2Type<SEGMENT_LENGTH>    /*iteration*/)
-    {
-        return partial;
-    }
-
-
-
-    /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
-    template <
-        bool                IS_FULL_TILE,
-        typename            ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T                   partial,            ///< [in] Calling thread's input partial reductions
-        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two)
-            partial = WarpReduce(temp_storage.warp_storage).template Reduce<IS_FULL_TILE, SEGMENT_LENGTH>(
-                partial,
-                num_valid,
-                reduction_op);
-        }
-        else
-        {
-            // Place partial into shared memory grid.
-            *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial;
-
-            CTA_SYNC();
-
-            // Reduce parallelism to one warp
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking reduction in grid
-                T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
-                partial = raking_segment[0];
-
-                partial = RakingReduction<IS_FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<1>());
-
-                partial = WarpReduce(temp_storage.warp_storage).template Reduce<IS_FULL_TILE && RAKING_UNGUARDED, SEGMENT_LENGTH>(
-                    partial,
-                    num_valid,
-                    reduction_op);
-
-            }
-        }
-
-        return partial;
-    }
-
-
-    /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
-    template <bool IS_FULL_TILE>
-    __device__ __forceinline__ T Sum(
-        T                   partial,            ///< [in] Calling thread's input partial reductions
-        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-    {
-        cub::Sum reduction_op;
-
-        return Reduce<IS_FULL_TILE>(partial, num_valid, reduction_op);
-    }
-
-
-
-};
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh
deleted file mode 100644
index 012c71d4e..000000000
--- a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh
+++ /dev/null
@@ -1,199 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block.  Does not support non-commutative reduction operators.
- */
-
-#pragma once
-
-#include "block_reduce_raking.cuh"
-#include "../../warp/warp_reduce.cuh"
-#include "../../thread/thread_reduce.cuh"
-#include "../../util_ptx.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block.  Does not support non-commutative reduction operators.  Does not support block sizes that are not a multiple of the warp size.
- */
-template <
-    typename    T,              ///< Data type being reduced
-    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
-    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
-    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
-struct BlockReduceRakingCommutativeOnly
-{
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values
-    typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> FallBack;
-
-    /// Constants
-    enum
-    {
-        /// Number of warp threads
-        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
-
-        /// Whether or not to use fall-back
-        USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)),
-
-        /// Number of raking threads
-        RAKING_THREADS = WARP_THREADS,
-
-        /// Number of threads actually sharing items with the raking threads
-        SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS),
-
-        /// Number of raking elements per warp synchronous raking thread
-        SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS,
-    };
-
-    ///  WarpReduce utility type
-    typedef WarpReduce<T, RAKING_THREADS, PTX_ARCH> WarpReduce;
-
-    /// Layout type for padded thread block raking grid
-    typedef BlockRakingLayout<T, SHARING_THREADS, PTX_ARCH> BlockRakingLayout;
-
-    /// Shared memory storage layout type
-    union _TempStorage
-    {
-        struct
-        {
-            typename WarpReduce::TempStorage        warp_storage;        ///< Storage for warp-synchronous reduction
-            typename BlockRakingLayout::TempStorage raking_grid;         ///< Padded thread block raking grid
-        };
-        typename FallBack::TempStorage              fallback_storage;    ///< Fall-back storage for non-commutative block scan
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    // Thread fields
-    _TempStorage &temp_storage;
-    unsigned int linear_tid;
-
-
-    /// Constructor
-    __device__ __forceinline__ BlockReduceRakingCommutativeOnly(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
-    template <bool FULL_TILE>
-    __device__ __forceinline__ T Sum(
-        T                   partial,            ///< [in] Calling thread's input partial reductions
-        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-    {
-        if (USE_FALLBACK || !FULL_TILE)
-        {
-            return FallBack(temp_storage.fallback_storage).template Sum<FULL_TILE>(partial, num_valid);
-        }
-        else
-        {
-            // Place partial into shared memory grid
-            if (linear_tid >= RAKING_THREADS)
-                *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial;
-
-            CTA_SYNC();
-
-            // Reduce parallelism to one warp
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking reduction in grid
-                T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
-                partial = internal::ThreadReduce<SEGMENT_LENGTH>(raking_segment, cub::Sum(), partial);
-
-                // Warpscan
-                partial = WarpReduce(temp_storage.warp_storage).Sum(partial);
-            }
-        }
-
-        return partial;
-    }
-
-
-    /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
-    template <
-        bool                FULL_TILE,
-        typename            ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T                   partial,            ///< [in] Calling thread's input partial reductions
-        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
-    {
-        if (USE_FALLBACK || !FULL_TILE)
-        {
-            return FallBack(temp_storage.fallback_storage).template Reduce<FULL_TILE>(partial, num_valid, reduction_op);
-        }
-        else
-        {
-            // Place partial into shared memory grid
-            if (linear_tid >= RAKING_THREADS)
-                *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial;
-
-            CTA_SYNC();
-
-            // Reduce parallelism to one warp
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking reduction in grid
-                T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
-                partial = internal::ThreadReduce<SEGMENT_LENGTH>(raking_segment, reduction_op, partial);
-
-                // Warpscan
-                partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op);
-            }
-        }
-
-        return partial;
-    }
-
-};
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh
deleted file mode 100644
index 2e8be1c3d..000000000
--- a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh
+++ /dev/null
@@ -1,222 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
- */
-
-#pragma once
-
-#include "../../warp/warp_reduce.cuh"
-#include "../../util_ptx.cuh"
-#include "../../util_arch.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
- */
-template <
-    typename    T,              ///< Data type being reduced
-    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
-    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
-    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
-struct BlockReduceWarpReductions
-{
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        /// Number of warp threads
-        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
-
-        /// Number of active warps
-        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-
-        /// The logical warp size for warp reductions
-        LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
-
-        /// Whether or not the logical warp size evenly divides the thread block size
-        EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0)
-    };
-
-
-    ///  WarpReduce utility type
-    typedef typename WarpReduce<T, LOGICAL_WARP_SIZE, PTX_ARCH>::InternalWarpReduce WarpReduce;
-
-
-    /// Shared memory storage layout type
-    struct _TempStorage
-    {
-        typename WarpReduce::TempStorage    warp_reduce[WARPS];                ///< Buffer for warp-synchronous scan
-        T                                   warp_aggregates[WARPS];     ///< Shared totals from each warp-synchronous scan
-        T                                   block_prefix;               ///< Shared prefix for the entire thread block
-    };
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    // Thread fields
-    _TempStorage &temp_storage;
-    unsigned int linear_tid;
-    unsigned int warp_id;
-    unsigned int lane_id;
-
-
-    /// Constructor
-    __device__ __forceinline__ BlockReduceWarpReductions(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
-        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
-        lane_id(LaneId())
-    {}
-
-
-    template <bool FULL_TILE, typename ReductionOp, int SUCCESSOR_WARP>
-    __device__ __forceinline__ T ApplyWarpAggregates(
-        ReductionOp                 reduction_op,       ///< [in] Binary scan operator
-        T                           warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
-        int                         num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-        Int2Type<SUCCESSOR_WARP>    /*successor_warp*/)
-    {
-        if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid))
-        {
-            T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP];
-            warp_aggregate = reduction_op(warp_aggregate, addend);
-        }
-        return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid, Int2Type<SUCCESSOR_WARP + 1>());
-    }
-
-    template <bool FULL_TILE, typename ReductionOp>
-    __device__ __forceinline__ T ApplyWarpAggregates(
-        ReductionOp         /*reduction_op*/,   ///< [in] Binary scan operator
-        T                   warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
-        int                 /*num_valid*/,      ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-        Int2Type<WARPS>     /*successor_warp*/)
-    {
-        return warp_aggregate;
-    }
-
-
-    /// Returns block-wide aggregate in <em>thread</em><sub>0</sub>.
-    template <
-        bool                FULL_TILE,
-        typename            ReductionOp>
-    __device__ __forceinline__ T ApplyWarpAggregates(
-        ReductionOp         reduction_op,       ///< [in] Binary scan operator
-        T                   warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
-        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-    {
-        // Share lane aggregates
-        if (lane_id == 0)
-        {
-            temp_storage.warp_aggregates[warp_id] = warp_aggregate;
-        }
-
-        CTA_SYNC();
-
-        // Update total aggregate in warp 0, lane 0
-        if (linear_tid == 0)
-        {
-            warp_aggregate = ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid, Int2Type<1>());
-        }
-
-        return warp_aggregate;
-    }
-
-
-    /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
-    template <bool FULL_TILE>
-    __device__ __forceinline__ T Sum(
-        T                   input,          ///< [in] Calling thread's input partial reductions
-        int                 num_valid)      ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-    {
-        cub::Sum        reduction_op;
-        unsigned int    warp_offset = warp_id * LOGICAL_WARP_SIZE;
-        unsigned int    warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ?
-                            LOGICAL_WARP_SIZE :
-                            (warp_offset < num_valid) ?
-                                num_valid - warp_offset :
-                                0;
-
-        // Warp reduction in every warp
-        T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>(
-            input,
-            warp_num_valid,
-            cub::Sum());
-
-        // Update outputs and block_aggregate with warp-wide aggregates from lane-0s
-        return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
-    }
-
-
-    /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
-    template <
-        bool                FULL_TILE,
-        typename            ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T                   input,              ///< [in] Calling thread's input partial reductions
-        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
-    {
-        unsigned int    warp_offset = warp_id * LOGICAL_WARP_SIZE;
-        unsigned int    warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ?
-                            LOGICAL_WARP_SIZE :
-                            (warp_offset < static_cast<unsigned int>(num_valid)) ?
-                                num_valid - warp_offset :
-                                0;
-
-        // Warp reduction in every warp
-        T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>(
-            input,
-            warp_num_valid,
-            reduction_op);
-
-        // Update outputs and block_aggregate with warp-wide aggregates from lane-0s
-        return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
-    }
-
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh
deleted file mode 100644
index 0d49d0693..000000000
--- a/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh
+++ /dev/null
@@ -1,666 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-
-/**
- * \file
- * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../../util_ptx.cuh"
-#include "../../util_arch.cuh"
-#include "../../block/block_raking_layout.cuh"
-#include "../../thread/thread_reduce.cuh"
-#include "../../thread/thread_scan.cuh"
-#include "../../warp/warp_scan.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block.
- */
-template <
-    typename    T,              ///< Data type being scanned
-    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
-    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
-    bool        MEMOIZE,        ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure
-    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
-struct BlockScanRaking
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    /// Layout type for padded thread block raking grid
-    typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;
-
-    /// Constants
-    enum
-    {
-        /// Number of raking threads
-        RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
-
-        /// Number of raking elements per warp synchronous raking thread
-        SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
-
-        /// Cooperative work can be entirely warp synchronous
-        WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS),
-    };
-
-    ///  WarpScan utility type
-    typedef WarpScan<T, RAKING_THREADS, PTX_ARCH> WarpScan;
-
-    /// Shared memory storage layout type
-    struct _TempStorage
-    {
-        typename WarpScan::TempStorage              warp_scan;          ///< Buffer for warp-synchronous scan
-        typename BlockRakingLayout::TempStorage     raking_grid;        ///< Padded thread block raking grid
-        T                                           block_aggregate;    ///< Block aggregate
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    // Thread fields
-    _TempStorage    &temp_storage;
-    unsigned int    linear_tid;
-    T               cached_segment[SEGMENT_LENGTH];
-
-
-    //---------------------------------------------------------------------
-    // Utility methods
-    //---------------------------------------------------------------------
-
-    /// Templated reduction
-    template <int ITERATION, typename ScanOp>
-    __device__ __forceinline__ T GuardedReduce(
-        T*                  raking_ptr,         ///< [in] Input array
-        ScanOp              scan_op,            ///< [in] Binary reduction operator
-        T                   raking_partial,     ///< [in] Prefix to seed reduction with
-        Int2Type<ITERATION> /*iteration*/)
-    {
-        if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS))
-        {
-            T addend = raking_ptr[ITERATION];
-            raking_partial = scan_op(raking_partial, addend);
-        }
-
-        return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type<ITERATION + 1>());
-    }
-
-
-    /// Templated reduction (base case)
-    template <typename ScanOp>
-    __device__ __forceinline__ T GuardedReduce(
-        T*                          /*raking_ptr*/,    ///< [in] Input array
-        ScanOp                      /*scan_op*/,       ///< [in] Binary reduction operator
-        T                           raking_partial,    ///< [in] Prefix to seed reduction with
-        Int2Type<SEGMENT_LENGTH>    /*iteration*/)
-    {
-        return raking_partial;
-    }
-
-
-    /// Templated copy
-    template <int ITERATION>
-    __device__ __forceinline__ void CopySegment(
-        T*                  out,            ///< [out] Out array
-        T*                  in,             ///< [in] Input array
-        Int2Type<ITERATION> /*iteration*/)
-    {
-        out[ITERATION] = in[ITERATION];
-        CopySegment(out, in, Int2Type<ITERATION + 1>());
-    }
-
- 
-    /// Templated copy (base case)
-    __device__ __forceinline__ void CopySegment(
-        T*                  /*out*/,            ///< [out] Out array
-        T*                  /*in*/,             ///< [in] Input array
-        Int2Type<SEGMENT_LENGTH> /*iteration*/)
-    {}
-
-
-    /// Performs upsweep raking reduction, returning the aggregate
-    template <typename ScanOp>
-    __device__ __forceinline__ T Upsweep(
-        ScanOp scan_op)
-    {
-        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
-
-        // Read data into registers
-        CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
-
-        T raking_partial = cached_segment[0];
-
-        return GuardedReduce(cached_segment, scan_op, raking_partial, Int2Type<1>());
-    }
-
-
-    /// Performs exclusive downsweep raking scan
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveDownsweep(
-        ScanOp          scan_op,
-        T               raking_partial,
-        bool            apply_prefix = true)
-    {
-        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
-
-        // Read data back into registers
-        if (!MEMOIZE)
-        {
-            CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
-        }
-
-        internal::ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);
-
-        // Write data back to smem
-        CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
-    }
-
-
-    /// Performs inclusive downsweep raking scan
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveDownsweep(
-        ScanOp          scan_op,
-        T               raking_partial,
-        bool            apply_prefix = true)
-    {
-        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
-
-        // Read data back into registers
-        if (!MEMOIZE)
-        {
-            CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
-        }
-
-        internal::ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);
-
-        // Write data back to smem
-        CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
-    }
-
-
-    //---------------------------------------------------------------------
-    // Constructors
-    //---------------------------------------------------------------------
-
-    /// Constructor
-    __device__ __forceinline__ BlockScanRaking(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Exclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &exclusive_output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan operator
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            CTA_SYNC();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Warp-synchronous scan
-                T exclusive_partial;
-                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);
-
-                // Exclusive raking downsweep scan
-                ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
-            }
-
-            CTA_SYNC();
-
-            // Grab thread prefix from shared memory
-            exclusive_output = *placement_ptr;
-        }
-    }
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
-        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            CTA_SYNC();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Exclusive Warp-synchronous scan
-                T exclusive_partial;
-                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op);
-
-                // Exclusive raking downsweep scan
-                ExclusiveDownsweep(scan_op, exclusive_partial);
-            }
-
-            CTA_SYNC();
-
-            // Grab exclusive partial from shared memory
-            output = *placement_ptr;
-        }
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan operator
-        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            CTA_SYNC();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial= Upsweep(scan_op);
-
-                // Warp-synchronous scan
-                T inclusive_partial;
-                T exclusive_partial;
-                WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);
-
-                // Exclusive raking downsweep scan
-                ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
-
-                // Broadcast aggregate to all threads
-                if (linear_tid == RAKING_THREADS - 1)
-                    temp_storage.block_aggregate = inclusive_partial;
-            }
-
-            CTA_SYNC();
-
-            // Grab thread prefix from shared memory
-            output = *placement_ptr;
-
-            // Retrieve block aggregate
-            block_aggregate = temp_storage.block_aggregate;
-        }
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
-        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            CTA_SYNC();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Warp-synchronous scan
-                T exclusive_partial;
-                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate);
-
-                // Exclusive raking downsweep scan
-                ExclusiveDownsweep(scan_op, exclusive_partial);
-
-                // Broadcast aggregate to other threads
-                if (linear_tid == 0)
-                    temp_storage.block_aggregate = block_aggregate;
-            }
-
-            CTA_SYNC();
-
-            // Grab exclusive partial from shared memory
-            output = *placement_ptr;
-
-            // Retrieve block aggregate
-            block_aggregate = temp_storage.block_aggregate;
-        }
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            T block_aggregate;
-            WarpScan warp_scan(temp_storage.warp_scan);
-            warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate);
-
-            // Obtain warp-wide prefix in lane0, then broadcast to other lanes
-            T block_prefix = block_prefix_callback_op(block_aggregate);
-            block_prefix = warp_scan.Broadcast(block_prefix, 0);
-
-            output = scan_op(block_prefix, output);
-            if (linear_tid == 0)
-                output = block_prefix;
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            CTA_SYNC();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                WarpScan warp_scan(temp_storage.warp_scan);
-
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Warp-synchronous scan
-                T exclusive_partial, block_aggregate;
-                warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);
-
-                // Obtain block-wide prefix in lane0, then broadcast to other lanes
-                T block_prefix = block_prefix_callback_op(block_aggregate);
-                block_prefix = warp_scan.Broadcast(block_prefix, 0);
-
-                // Update prefix with warpscan exclusive partial
-                T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
-                if (linear_tid == 0)
-                    downsweep_prefix = block_prefix;
-
-                // Exclusive raking downsweep scan
-                ExclusiveDownsweep(scan_op, downsweep_prefix);
-            }
-
-            CTA_SYNC();
-
-            // Grab thread prefix from shared memory
-            output = *placement_ptr;
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Inclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan operator
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            CTA_SYNC();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Exclusive Warp-synchronous scan
-                T exclusive_partial;
-                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);
-
-                // Inclusive raking downsweep scan
-                InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
-            }
-
-            CTA_SYNC();
-
-            // Grab thread prefix from shared memory
-            output = *placement_ptr;
-        }
-    }
-
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan operator
-        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            CTA_SYNC();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Warp-synchronous scan
-                T inclusive_partial;
-                T exclusive_partial;
-                WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);
-
-                // Inclusive raking downsweep scan
-                InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
-
-                // Broadcast aggregate to all threads
-                if (linear_tid == RAKING_THREADS - 1)
-                    temp_storage.block_aggregate = inclusive_partial;
-            }
-
-            CTA_SYNC();
-
-            // Grab thread prefix from shared memory
-            output = *placement_ptr;
-
-            // Retrieve block aggregate
-            block_aggregate = temp_storage.block_aggregate;
-        }
-    }
-
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            T block_aggregate;
-            WarpScan warp_scan(temp_storage.warp_scan);
-            warp_scan.InclusiveScan(input, output, scan_op, block_aggregate);
-
-            // Obtain warp-wide prefix in lane0, then broadcast to other lanes
-            T block_prefix = block_prefix_callback_op(block_aggregate);
-            block_prefix = warp_scan.Broadcast(block_prefix, 0);
-
-            // Update prefix with exclusive warpscan partial
-            output = scan_op(block_prefix, output);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            CTA_SYNC();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                WarpScan warp_scan(temp_storage.warp_scan);
-
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Warp-synchronous scan
-                T exclusive_partial, block_aggregate;
-                warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate);
-
-                // Obtain block-wide prefix in lane0, then broadcast to other lanes
-                T block_prefix = block_prefix_callback_op(block_aggregate);
-                block_prefix = warp_scan.Broadcast(block_prefix, 0);
-
-                // Update prefix with warpscan exclusive partial
-                T downsweep_prefix = scan_op(block_prefix, exclusive_partial);
-                if (linear_tid == 0)
-                    downsweep_prefix = block_prefix;
-
-                // Inclusive raking downsweep scan
-                InclusiveDownsweep(scan_op, downsweep_prefix);
-            }
-
-            CTA_SYNC();
-
-            // Grab thread prefix from shared memory
-            output = *placement_ptr;
-        }
-    }
-
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh
deleted file mode 100644
index 6f582a8e4..000000000
--- a/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh
+++ /dev/null
@@ -1,392 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../../util_arch.cuh"
-#include "../../util_ptx.cuh"
-#include "../../warp/warp_scan.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
- */
-template <
-    typename    T,
-    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
-    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
-    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
-struct BlockScanWarpScans
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// Constants
-    enum
-    {
-        /// Number of warp threads
-        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
-
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        /// Number of active warps
-        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-    };
-
-    ///  WarpScan utility type
-    typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScanT;
-
-    ///  WarpScan utility type
-    typedef WarpScan<T, WARPS, PTX_ARCH> WarpAggregateScan;
-
-    /// Shared memory storage layout type
-
-    struct __align__(32) _TempStorage
-    {
-        T                               warp_aggregates[WARPS];
-        typename WarpScanT::TempStorage warp_scan[WARPS];           ///< Buffer for warp-synchronous scans
-        T                               block_prefix;               ///< Shared prefix for the entire thread block
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    // Thread fields
-    _TempStorage    &temp_storage;
-    unsigned int    linear_tid;
-    unsigned int    warp_id;
-    unsigned int    lane_id;
-
-
-    //---------------------------------------------------------------------
-    // Constructors
-    //---------------------------------------------------------------------
-
-    /// Constructor
-    __device__ __forceinline__ BlockScanWarpScans(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
-        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
-        lane_id(LaneId())
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Utility methods
-    //---------------------------------------------------------------------
-
-    template <typename ScanOp, int WARP>
-    __device__ __forceinline__ void ApplyWarpAggregates(
-        T               &warp_prefix,           ///< [out] The calling thread's partial reduction
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
-        Int2Type<WARP>  /*addend_warp*/)
-    {
-        if (warp_id == WARP)
-            warp_prefix = block_aggregate;
-
-        T addend = temp_storage.warp_aggregates[WARP];
-        block_aggregate = scan_op(block_aggregate, addend);
-
-        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<WARP + 1>());
-    }
-
-    template <typename ScanOp>
-    __device__ __forceinline__ void ApplyWarpAggregates(
-        T               &/*warp_prefix*/,       ///< [out] The calling thread's partial reduction
-        ScanOp          /*scan_op*/,            ///< [in] Binary scan operator
-        T               &/*block_aggregate*/,   ///< [out] Threadblock-wide aggregate reduction of input items
-        Int2Type<WARPS> /*addend_warp*/)
-    {}
-
-
-    /// Use the warp-wide aggregates to compute the calling warp's prefix.  Also returns block-wide aggregate in all threads.
-    template <typename ScanOp>
-    __device__ __forceinline__ T ComputeWarpPrefix(
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
-        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        // Last lane in each warp shares its warp-aggregate
-        if (lane_id == WARP_THREADS - 1)
-            temp_storage.warp_aggregates[warp_id] = warp_aggregate;
-
-        CTA_SYNC();
-
-        // Accumulate block aggregates and save the one that is our warp's prefix
-        T warp_prefix;
-        block_aggregate = temp_storage.warp_aggregates[0];
-
-        // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x)
-        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>());
-/*
-        #pragma unroll
-        for (int WARP = 1; WARP < WARPS; ++WARP)
-        {
-            if (warp_id == WARP)
-                warp_prefix = block_aggregate;
-
-            T addend = temp_storage.warp_aggregates[WARP];
-            block_aggregate = scan_op(block_aggregate, addend);
-        }
-*/
-
-        return warp_prefix;
-    }
-
-
-    /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix.  Also returns block-wide aggregate in all threads.
-    template <typename ScanOp>
-    __device__ __forceinline__ T ComputeWarpPrefix(
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
-        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
-        const T         &initial_value)     ///< [in] Initial value to seed the exclusive scan
-    {
-        T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate);
-
-        warp_prefix = scan_op(initial_value, warp_prefix);
-
-        if (warp_id == 0)
-            warp_prefix = initial_value;
-
-        return warp_prefix;
-    }
-
-    //---------------------------------------------------------------------
-    // Exclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan operator
-    {
-        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
-        T block_aggregate;
-        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
-        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        T block_aggregate;
-        ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item
-        T               &exclusive_output,  ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
-        T inclusive_output;
-        WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op);
-
-        // Compute the warp-wide prefix and block-wide aggregate for each warp.  Warp prefix for warp0 is invalid.
-        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
-
-        // Apply warp prefix to our lane's partial
-        if (warp_id != 0)
-        {
-            exclusive_output = scan_op(warp_prefix, exclusive_output);
-            if (lane_id == 0)
-                exclusive_output = warp_prefix;
-        }
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
-        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
-        T inclusive_output;
-        WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op);
-
-        // Compute the warp-wide prefix and block-wide aggregate for each warp
-        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value);
-
-        // Apply warp prefix to our lane's partial
-        exclusive_output = scan_op(warp_prefix, exclusive_output);
-        if (lane_id == 0)
-            exclusive_output = warp_prefix;
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
-    {
-        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
-        T block_aggregate;
-        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
-
-        // Use the first warp to determine the thread block prefix, returning the result in lane0
-        if (warp_id == 0)
-        {
-            T block_prefix = block_prefix_callback_op(block_aggregate);
-            if (lane_id == 0)
-            {
-                // Share the prefix with all threads
-                temp_storage.block_prefix = block_prefix;
-                exclusive_output = block_prefix;                // The block prefix is the exclusive output for tid0
-            }
-        }
-
-        CTA_SYNC();
-
-        // Incorporate thread block prefix into outputs
-        T block_prefix = temp_storage.block_prefix;
-        if (linear_tid > 0)
-        {
-            exclusive_output = scan_op(block_prefix, exclusive_output);
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Inclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan operator
-    {
-        T block_aggregate;
-        InclusiveScan(input, inclusive_output, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan operator
-        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op);
-
-        // Compute the warp-wide prefix and block-wide aggregate for each warp.  Warp prefix for warp0 is invalid.
-        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
-
-        // Apply warp prefix to our lane's partial
-        if (warp_id != 0)
-        {
-            inclusive_output = scan_op(warp_prefix, inclusive_output);
-        }
-    }
-
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
-    {
-        T block_aggregate;
-        InclusiveScan(input, exclusive_output, scan_op, block_aggregate);
-
-        // Use the first warp to determine the thread block prefix, returning the result in lane0
-        if (warp_id == 0)
-        {
-            T block_prefix = block_prefix_callback_op(block_aggregate);
-            if (lane_id == 0)
-            {
-                // Share the prefix with all threads
-                temp_storage.block_prefix = block_prefix;
-            }
-        }
-
-        CTA_SYNC();
-
-        // Incorporate thread block prefix into outputs
-        T block_prefix = temp_storage.block_prefix;
-        exclusive_output = scan_op(block_prefix, exclusive_output);
-    }
-
-
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh
deleted file mode 100644
index 2be0e749c..000000000
--- a/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh
+++ /dev/null
@@ -1,436 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../../util_arch.cuh"
-#include "../../util_ptx.cuh"
-#include "../../warp/warp_scan.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
- */
-template <
-    typename    T,
-    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
-    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
-    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
-struct BlockScanWarpScans
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// Constants
-    enum
-    {
-        /// Number of warp threads
-        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
-
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        /// Number of active warps
-        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-    };
-
-    ///  WarpScan utility type
-    typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScanT;
-
-    ///  WarpScan utility type
-    typedef WarpScan<T, WARPS, PTX_ARCH> WarpAggregateScanT;
-
-    /// Shared memory storage layout type
-    struct _TempStorage
-    {
-        typename WarpAggregateScanT::TempStorage    inner_scan[WARPS];          ///< Buffer for warp-synchronous scans
-        typename WarpScanT::TempStorage             warp_scan[WARPS];           ///< Buffer for warp-synchronous scans
-        T                                           warp_aggregates[WARPS];
-        T                                           block_prefix;               ///< Shared prefix for the entire thread block
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    // Thread fields
-    _TempStorage    &temp_storage;
-    unsigned int    linear_tid;
-    unsigned int    warp_id;
-    unsigned int    lane_id;
-
-
-    //---------------------------------------------------------------------
-    // Constructors
-    //---------------------------------------------------------------------
-
-    /// Constructor
-    __device__ __forceinline__ BlockScanWarpScans(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
-        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
-        lane_id(LaneId())
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Utility methods
-    //---------------------------------------------------------------------
-
-    template <typename ScanOp, int WARP>
-    __device__ __forceinline__ void ApplyWarpAggregates(
-        T               &warp_prefix,           ///< [out] The calling thread's partial reduction
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
-        Int2Type<WARP>  addend_warp)
-    {
-        if (warp_id == WARP)
-            warp_prefix = block_aggregate;
-
-        T addend = temp_storage.warp_aggregates[WARP];
-        block_aggregate = scan_op(block_aggregate, addend);
-
-        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<WARP + 1>());
-    }
-
-    template <typename ScanOp>
-    __device__ __forceinline__ void ApplyWarpAggregates(
-        T               &warp_prefix,           ///< [out] The calling thread's partial reduction
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
-        Int2Type<WARPS> addend_warp)
-    {}
-
-
-    /// Use the warp-wide aggregates to compute the calling warp's prefix.  Also returns block-wide aggregate in all threads.
-    template <typename ScanOp>
-    __device__ __forceinline__ T ComputeWarpPrefix(
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
-        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        // Last lane in each warp shares its warp-aggregate
-        if (lane_id == WARP_THREADS - 1)
-            temp_storage.warp_aggregates[warp_id] = warp_aggregate;
-
-        CTA_SYNC();
-
-        // Accumulate block aggregates and save the one that is our warp's prefix
-        T warp_prefix;
-        block_aggregate = temp_storage.warp_aggregates[0];
-
-        // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x)
-        ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>());
-/*
-        #pragma unroll
-        for (int WARP = 1; WARP < WARPS; ++WARP)
-        {
-            if (warp_id == WARP)
-                warp_prefix = block_aggregate;
-
-            T addend = temp_storage.warp_aggregates[WARP];
-            block_aggregate = scan_op(block_aggregate, addend);
-        }
-*/
-
-        return warp_prefix;
-    }
-
-
-    /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix.  Also returns block-wide aggregate in all threads.
-    template <typename ScanOp>
-    __device__ __forceinline__ T ComputeWarpPrefix(
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
-        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
-        const T         &initial_value)     ///< [in] Initial value to seed the exclusive scan
-    {
-        T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate);
-
-        warp_prefix = scan_op(initial_value, warp_prefix);
-
-        if (warp_id == 0)
-            warp_prefix = initial_value;
-
-        return warp_prefix;
-    }
-
-    //---------------------------------------------------------------------
-    // Exclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan operator
-    {
-        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
-        T block_aggregate;
-        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
-        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        T block_aggregate;
-        ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item
-        T               &exclusive_output,  ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]);
-
-        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
-        T inclusive_output;
-        my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op);
-
-        // Compute the warp-wide prefix and block-wide aggregate for each warp.  Warp prefix for warp0 is invalid.
-//        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
-
-//--------------------------------------------------
-        // Last lane in each warp shares its warp-aggregate
-        if (lane_id == WARP_THREADS - 1)
-            temp_storage.warp_aggregates[warp_id] = inclusive_output;
-
-        CTA_SYNC();
-
-        // Get the warp scan partial
-        T warp_inclusive, warp_prefix;
-        if (lane_id < WARPS)
-        {
-            // Scan the warpscan partials
-            T warp_val = temp_storage.warp_aggregates[lane_id];
-            WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, scan_op);
-        }
-
-        warp_prefix         = my_warp_scan.Broadcast(warp_prefix, warp_id);
-        block_aggregate     = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1);
-//--------------------------------------------------
-
-        // Apply warp prefix to our lane's partial
-        if (warp_id != 0)
-        {
-            exclusive_output = scan_op(warp_prefix, exclusive_output);
-            if (lane_id == 0)
-                exclusive_output = warp_prefix;
-        }
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
-        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]);
-
-        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
-        T inclusive_output;
-        my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op);
-
-        // Compute the warp-wide prefix and block-wide aggregate for each warp
-//        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value);
-
-//--------------------------------------------------
-        // Last lane in each warp shares its warp-aggregate
-        if (lane_id == WARP_THREADS - 1)
-            temp_storage.warp_aggregates[warp_id] = inclusive_output;
-
-        CTA_SYNC();
-
-        // Get the warp scan partial
-        T warp_inclusive, warp_prefix;
-        if (lane_id < WARPS)
-        {
-            // Scan the warpscan partials
-            T warp_val = temp_storage.warp_aggregates[lane_id];
-            WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, initial_value, scan_op);
-        }
-
-        warp_prefix         = my_warp_scan.Broadcast(warp_prefix, warp_id);
-        block_aggregate     = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1);
-//--------------------------------------------------
-
-        // Apply warp prefix to our lane's partial
-        exclusive_output = scan_op(warp_prefix, exclusive_output);
-        if (lane_id == 0)
-            exclusive_output = warp_prefix;
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
-    {
-        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
-        T block_aggregate;
-        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
-
-        // Use the first warp to determine the thread block prefix, returning the result in lane0
-        if (warp_id == 0)
-        {
-            T block_prefix = block_prefix_callback_op(block_aggregate);
-            if (lane_id == 0)
-            {
-                // Share the prefix with all threads
-                temp_storage.block_prefix = block_prefix;
-                exclusive_output = block_prefix;                // The block prefix is the exclusive output for tid0
-            }
-        }
-
-        CTA_SYNC();
-
-        // Incorporate thread block prefix into outputs
-        T block_prefix = temp_storage.block_prefix;
-        if (linear_tid > 0)
-        {
-            exclusive_output = scan_op(block_prefix, exclusive_output);
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Inclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan operator
-    {
-        T block_aggregate;
-        InclusiveScan(input, inclusive_output, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan operator
-        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op);
-
-        // Compute the warp-wide prefix and block-wide aggregate for each warp.  Warp prefix for warp0 is invalid.
-        T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate);
-
-        // Apply warp prefix to our lane's partial
-        if (warp_id != 0)
-        {
-            inclusive_output = scan_op(warp_prefix, inclusive_output);
-        }
-    }
-
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
-    {
-        T block_aggregate;
-        InclusiveScan(input, exclusive_output, scan_op, block_aggregate);
-
-        // Use the first warp to determine the thread block prefix, returning the result in lane0
-        if (warp_id == 0)
-        {
-            T block_prefix = block_prefix_callback_op(block_aggregate);
-            if (lane_id == 0)
-            {
-                // Share the prefix with all threads
-                temp_storage.block_prefix = block_prefix;
-            }
-        }
-
-        CTA_SYNC();
-
-        // Incorporate thread block prefix into outputs
-        T block_prefix = temp_storage.block_prefix;
-        exclusive_output = scan_op(block_prefix, exclusive_output);
-    }
-
-
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh
deleted file mode 100644
index 15a9cf54b..000000000
--- a/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh
+++ /dev/null
@@ -1,418 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../../util_arch.cuh"
-#include "../../util_ptx.cuh"
-#include "../../warp/warp_scan.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
- */
-template <
-    typename    T,
-    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
-    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
-    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
-struct BlockScanWarpScans
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        /// Number of warp threads
-        INNER_WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
-        OUTER_WARP_THREADS = BLOCK_THREADS / INNER_WARP_THREADS,
-
-        /// Number of outer scan warps
-        OUTER_WARPS = INNER_WARP_THREADS
-    };
-
-    ///  Outer WarpScan utility type
-    typedef WarpScan<T, OUTER_WARP_THREADS, PTX_ARCH> OuterWarpScanT;
-
-    ///  Inner WarpScan utility type
-    typedef WarpScan<T, INNER_WARP_THREADS, PTX_ARCH> InnerWarpScanT;
-
-    typedef typename OuterWarpScanT::TempStorage OuterScanArray[OUTER_WARPS];
-
-
-    /// Shared memory storage layout type
-    struct _TempStorage
-    {
-        union Aliasable
-        {
-            Uninitialized<OuterScanArray>           outer_warp_scan;  ///< Buffer for warp-synchronous outer scans
-            typename InnerWarpScanT::TempStorage    inner_warp_scan;  ///< Buffer for warp-synchronous inner scan
-
-        } aliasable;
-
-        T                               warp_aggregates[OUTER_WARPS];
-
-        T                               block_aggregate;                           ///< Shared prefix for the entire thread block
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    // Thread fields
-    _TempStorage    &temp_storage;
-    unsigned int    linear_tid;
-    unsigned int    warp_id;
-    unsigned int    lane_id;
-
-
-    //---------------------------------------------------------------------
-    // Constructors
-    //---------------------------------------------------------------------
-
-    /// Constructor
-    __device__ __forceinline__ BlockScanWarpScans(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
-        warp_id((OUTER_WARPS == 1) ? 0 : linear_tid / OUTER_WARP_THREADS),
-        lane_id((OUTER_WARPS == 1) ? linear_tid : linear_tid % OUTER_WARP_THREADS)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Exclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan operator
-    {
-        // Compute block-wide exclusive scan.  The exclusive output from tid0 is invalid.
-        T block_aggregate;
-        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
-        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        T block_aggregate;
-        ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no initial value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item
-        T               &exclusive_output,  ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
-        T inclusive_output;
-        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
-            input, inclusive_output, exclusive_output, scan_op);
-
-        // Share outer warp total
-        if (lane_id == OUTER_WARP_THREADS - 1)
-            temp_storage.warp_aggregates[warp_id] = inclusive_output;
-
-        CTA_SYNC();
-
-        if (linear_tid < INNER_WARP_THREADS)
-        {
-            T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
-            T outer_warp_exclusive;
-
-            InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
-                outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate);
-
-            temp_storage.block_aggregate                = block_aggregate;
-            temp_storage.warp_aggregates[linear_tid]    = outer_warp_exclusive;
-        }
-
-        CTA_SYNC();
-
-        if (warp_id != 0)
-        {
-            // Retrieve block aggregate
-            block_aggregate = temp_storage.block_aggregate;
-
-            // Apply warp prefix to our lane's partial
-            T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
-            exclusive_output = scan_op(outer_warp_exclusive, exclusive_output);
-            if (lane_id == 0)
-                exclusive_output = outer_warp_exclusive;
-        }
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &exclusive_output,  ///< [out] Calling thread's output items (may be aliased to \p input)
-        const T         &initial_value,     ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
-        T inclusive_output;
-        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
-            input, inclusive_output, exclusive_output, scan_op);
-
-        // Share outer warp total
-        if (lane_id == OUTER_WARP_THREADS - 1)
-        {
-            temp_storage.warp_aggregates[warp_id] = inclusive_output;
-        }
-
-        CTA_SYNC();
-
-        if (linear_tid < INNER_WARP_THREADS)
-        {
-            T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
-            T outer_warp_exclusive;
-
-            InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
-                outer_warp_input, outer_warp_exclusive, initial_value, scan_op, block_aggregate);
-
-            temp_storage.block_aggregate                = block_aggregate;
-            temp_storage.warp_aggregates[linear_tid]    = outer_warp_exclusive;
-        }
-
-        CTA_SYNC();
-
-        // Retrieve block aggregate
-        block_aggregate = temp_storage.block_aggregate;
-
-        // Apply warp prefix to our lane's partial
-        T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
-        exclusive_output = scan_op(outer_warp_exclusive, exclusive_output);
-        if (lane_id == 0)
-            exclusive_output = outer_warp_exclusive;
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &exclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
-    {
-        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
-        T inclusive_output;
-        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan(
-            input, inclusive_output, exclusive_output, scan_op);
-
-        // Share outer warp total
-        if (lane_id == OUTER_WARP_THREADS - 1)
-            temp_storage.warp_aggregates[warp_id] = inclusive_output;
-
-        CTA_SYNC();
-
-        if (linear_tid < INNER_WARP_THREADS)
-        {
-            InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan);
-
-            T upsweep = temp_storage.warp_aggregates[linear_tid];
-            T downsweep_prefix, block_aggregate;
-
-            inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate);
-
-            // Use callback functor to get block prefix in lane0 and then broadcast to other lanes
-            T block_prefix = block_prefix_callback_op(block_aggregate);
-            block_prefix = inner_scan.Broadcast(block_prefix, 0);
-
-            downsweep_prefix = scan_op(block_prefix, downsweep_prefix);
-            if (linear_tid == 0)
-                downsweep_prefix = block_prefix;
-
-            temp_storage.warp_aggregates[linear_tid] = downsweep_prefix;
-        }
-
-        CTA_SYNC();
-
-        // Apply warp prefix to our lane's partial (or assign it if partial is invalid)
-        T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
-        exclusive_output = scan_op(outer_warp_exclusive, exclusive_output);
-        if (lane_id == 0)
-            exclusive_output = outer_warp_exclusive;
-    }
-
-
-    //---------------------------------------------------------------------
-    // Inclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan operator
-    {
-        T block_aggregate;
-        InclusiveScan(input, inclusive_output, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan operator
-        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
-        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan(
-            input, inclusive_output, scan_op);
-
-        // Share outer warp total
-        if (lane_id == OUTER_WARP_THREADS - 1)
-            temp_storage.warp_aggregates[warp_id] = inclusive_output;
-
-        CTA_SYNC();
-
-        if (linear_tid < INNER_WARP_THREADS)
-        {
-            T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
-            T outer_warp_exclusive;
-
-            InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
-                outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate);
-
-            temp_storage.block_aggregate                = block_aggregate;
-            temp_storage.warp_aggregates[linear_tid]    = outer_warp_exclusive;
-        }
-
-        CTA_SYNC();
-
-        if (warp_id != 0)
-        {
-            // Retrieve block aggregate
-            block_aggregate = temp_storage.block_aggregate;
-
-            // Apply warp prefix to our lane's partial
-            T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
-            inclusive_output = scan_op(outer_warp_exclusive, inclusive_output);
-        }
-    }
-
-
-    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &inclusive_output,              ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
-    {
-        // Compute warp scan in each warp.  The exclusive output from each lane0 is invalid.
-        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan(
-            input, inclusive_output, scan_op);
-
-        // Share outer warp total
-        if (lane_id == OUTER_WARP_THREADS - 1)
-            temp_storage.warp_aggregates[warp_id] = inclusive_output;
-
-        CTA_SYNC();
-
-        if (linear_tid < INNER_WARP_THREADS)
-        {
-            InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan);
-
-            T upsweep = temp_storage.warp_aggregates[linear_tid];
-            T downsweep_prefix, block_aggregate;
-            inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate);
-
-            // Use callback functor to get block prefix in lane0 and then broadcast to other lanes
-            T block_prefix = block_prefix_callback_op(block_aggregate);
-            block_prefix = inner_scan.Broadcast(block_prefix, 0);
-
-            downsweep_prefix = scan_op(block_prefix, downsweep_prefix);
-            if (linear_tid == 0)
-                downsweep_prefix = block_prefix;
-
-            temp_storage.warp_aggregates[linear_tid]    = downsweep_prefix;
-        }
-
-        CTA_SYNC();
-
-        // Apply warp prefix to our lane's partial
-        T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
-        inclusive_output = scan_op(outer_warp_exclusive, inclusive_output);
-    }
-
-
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/cub.cuh b/thrust/system/cuda/detail/cub/cub.cuh
deleted file mode 100644
index 3ece0f658..000000000
--- a/thrust/system/cuda/detail/cub/cub.cuh
+++ /dev/null
@@ -1,95 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * CUB umbrella include file
- */
-
-#pragma once
-
-
-// Block
-#include "block/block_histogram.cuh"
-#include "block/block_discontinuity.cuh"
-#include "block/block_exchange.cuh"
-#include "block/block_load.cuh"
-#include "block/block_radix_rank.cuh"
-#include "block/block_radix_sort.cuh"
-#include "block/block_reduce.cuh"
-#include "block/block_scan.cuh"
-#include "block/block_store.cuh"
-//#include "block/block_shift.cuh"
-
-// Device
-#include "device/device_histogram.cuh"
-#include "device/device_partition.cuh"
-#include "device/device_radix_sort.cuh"
-#include "device/device_reduce.cuh"
-#include "device/device_run_length_encode.cuh"
-#include "device/device_scan.cuh"
-#include "device/device_segmented_radix_sort.cuh"
-#include "device/device_segmented_reduce.cuh"
-#include "device/device_select.cuh"
-#include "device/device_spmv.cuh"
-
-// Grid
-//#include "grid/grid_barrier.cuh"
-#include "grid/grid_even_share.cuh"
-#include "grid/grid_mapping.cuh"
-#include "grid/grid_queue.cuh"
-
-// Thread
-#include "thread/thread_load.cuh"
-#include "thread/thread_operators.cuh"
-#include "thread/thread_reduce.cuh"
-#include "thread/thread_scan.cuh"
-#include "thread/thread_store.cuh"
-
-// Warp
-#include "warp/warp_reduce.cuh"
-#include "warp/warp_scan.cuh"
-
-// Iterator
-#include "iterator/arg_index_input_iterator.cuh"
-#include "iterator/cache_modified_input_iterator.cuh"
-#include "iterator/cache_modified_output_iterator.cuh"
-#include "iterator/constant_input_iterator.cuh"
-#include "iterator/counting_input_iterator.cuh"
-#include "iterator/tex_obj_input_iterator.cuh"
-#include "iterator/tex_ref_input_iterator.cuh"
-#include "iterator/transform_input_iterator.cuh"
-
-// Util
-#include "util_arch.cuh"
-#include "util_debug.cuh"
-#include "util_device.cuh"
-#include "util_macro.cuh"
-#include "util_ptx.cuh"
-#include "util_type.cuh"
-
diff --git a/thrust/system/cuda/detail/cub/device/device_histogram.cuh b/thrust/system/cuda/detail/cub/device/device_histogram.cuh
deleted file mode 100644
index 259bcad32..000000000
--- a/thrust/system/cuda/detail/cub/device/device_histogram.cuh
+++ /dev/null
@@ -1,866 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-#include <limits>
-
-#include "dispatch/dispatch_histogram.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. ![](histogram_logo.png)
- * \ingroup SingleModule
- *
- * \par Overview
- * A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
- * counts the number of observations that fall into each of the disjoint categories (known as <em>bins</em>).
- *
- * \par Usage Considerations
- * \cdp_class{DeviceHistogram}
- *
- */
-struct DeviceHistogram
-{
-    /******************************************************************//**
-     * \name Evenly-segmented bin ranges
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins.
-     *
-     * \par
-     * - The number of histogram bins is (\p num_levels - 1)
-     * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1)
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of a six-bin histogram
-     * from a sequence of float samples
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input samples and
-     * // output histogram
-     * int      num_samples;    // e.g., 10
-     * float*   d_samples;      // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5]
-     * int*     d_histogram;    // e.g., [ -, -, -, -, -, -, -, -]
-     * int      num_levels;     // e.g., 7       (seven level boundaries for six bins)
-     * float    lower_level;    // e.g., 0.0     (lower sample value boundary of lowest bin)
-     * float    upper_level;    // e.g., 12.0    (upper sample value boundary of upper bin)
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void*    d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histograms
-     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
-     *
-     * // d_histogram   <-- [1, 0, 5, 0, 3, 0, 0, 0];
-     *
-     * \endcode
-     *
-     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
-     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
-     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
-     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
-     */
-    template <
-        typename            SampleIteratorT,
-        typename            CounterT,
-        typename            LevelT,
-        typename            OffsetT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t HistogramEven(
-        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of data samples.
-        CounterT*           d_histogram,                                ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
-        int                 num_levels,                                 ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
-        LevelT              lower_level,                                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin.
-        LevelT              upper_level,                                ///< [in] The upper sample value bound (exclusive) for the highest histogram bin.
-        OffsetT             num_samples,                                ///< [in] The number of input samples (i.e., the length of \p d_samples)
-        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        /// The sample value type of the input iterator
-        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-
-        CounterT*           d_histogram1[1]     = {d_histogram};
-        int                 num_levels1[1]      = {num_levels};
-        LevelT              lower_level1[1]     = {lower_level};
-        LevelT              upper_level1[1]     = {upper_level};
-
-        return MultiHistogramEven<1, 1>(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            d_histogram1,
-            num_levels1,
-            lower_level1,
-            upper_level1,
-            num_samples,
-            1,
-            sizeof(SampleT) * num_samples,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins.
-     *
-     * \par
-     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
-     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
-     * - The row stride must be a whole multiple of the sample data type
-     *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
-     * - The number of histogram bins is (\p num_levels - 1)
-     * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1)
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of a six-bin histogram
-     * from a 2x5 region of interest within a flattened 2x7 array of float samples.
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input samples and
-     * // output histogram
-     * int      num_row_samples;    // e.g., 5
-     * int      num_rows;           // e.g., 2;
-     * size_t   row_stride_bytes;   // e.g., 7 * sizeof(float)
-     * float*   d_samples;          // e.g., [2.2, 6.0, 7.1, 2.9, 3.5,   -, -,
-     *                              //        0.3, 2.9, 2.0, 6.1, 999.5, -, -]
-     * int*     d_histogram;        // e.g., [ -, -, -, -, -, -, -, -]
-     * int      num_levels;         // e.g., 7       (seven level boundaries for six bins)
-     * float    lower_level;        // e.g., 0.0     (lower sample value boundary of lowest bin)
-     * float    upper_level;        // e.g., 12.0    (upper sample value boundary of upper bin)
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void*    d_temp_storage  = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
-     *     num_row_samples, num_rows, row_stride_bytes);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histograms
-     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram,
-     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
-     *     num_row_samples, num_rows, row_stride_bytes);
-     *
-     * // d_histogram   <-- [1, 0, 5, 0, 3, 0, 0, 0];
-     *
-     * \endcode
-     *
-     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
-     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
-     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
-     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
-     */
-    template <
-        typename            SampleIteratorT,
-        typename            CounterT,
-        typename            LevelT,
-        typename            OffsetT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t HistogramEven(
-        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of data samples.
-        CounterT*           d_histogram,                                ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
-        int                 num_levels,                                 ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
-        LevelT              lower_level,                                ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin.
-        LevelT              upper_level,                                ///< [in] The upper sample value bound (exclusive) for the highest histogram bin.
-        OffsetT             num_row_samples,                            ///< [in] The number of data samples per row in the region of interest
-        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
-        size_t              row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
-        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        CounterT*           d_histogram1[1]     = {d_histogram};
-        int                 num_levels1[1]      = {num_levels};
-        LevelT              lower_level1[1]     = {lower_level};
-        LevelT              upper_level1[1]     = {upper_level};
-
-        return MultiHistogramEven<1, 1>(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            d_histogram1,
-            num_levels1,
-            lower_level1,
-            upper_level1,
-            num_row_samples,
-            num_rows,
-            row_stride_bytes,
-            stream,
-            debug_synchronous);
-    }
-
-    /**
-     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
-     *
-     * \par
-     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
-     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
-     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
-     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., only <em>RGB</em> histograms from <em>RGBA</em>
-     *   pixel samples).
-     * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-     * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
-     *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of three 256-bin <em>RGB</em> histograms
-     * from a quad-channel sequence of <em>RGBA</em> pixels (8 bits per channel per pixel)
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input samples
-     * // and output histograms
-     * int              num_pixels;         // e.g., 5
-     * unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2),
-     *                                      //        (0, 6, 7, 5), (3, 0, 2, 6)]
-     * int*             d_histogram[3];     // e.g., three device pointers to three device buffers,
-     *                                      //       each allocated with 256 integer counters
-     * int              num_levels[3];      // e.g., {257, 257, 257};
-     * unsigned int     lower_level[3];     // e.g., {0, 0, 0};
-     * unsigned int     upper_level[3];     // e.g., {256, 256, 256};
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void*    d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histograms
-     * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels);
-     *
-     * // d_histogram   <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0],
-     * //                     [0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0],
-     * //                     [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ]
-     *
-     * \endcode
-     *
-     * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
-     * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
-     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
-     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
-     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
-     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
-     */
-    template <
-        int                 NUM_CHANNELS,
-        int                 NUM_ACTIVE_CHANNELS,
-        typename            SampleIteratorT,
-        typename            CounterT,
-        typename            LevelT,
-        typename            OffsetT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t MultiHistogramEven(
-        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
-        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],           ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
-        int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
-        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
-        OffsetT             num_pixels,                                 ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS)
-        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        /// The sample value type of the input iterator
-        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-
-        return MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            d_histogram,
-            num_levels,
-            lower_level,
-            upper_level,
-            num_pixels,
-            1,
-            sizeof(SampleT) * NUM_CHANNELS * num_pixels,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
-     *
-     * \par
-     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
-     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
-     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
-     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., only <em>RGB</em> histograms from <em>RGBA</em>
-     *   pixel samples).
-     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
-     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
-     * - The row stride must be a whole multiple of the sample data type
-     *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
-     * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-     * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
-     *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of three 256-bin <em>RGB</em> histograms from a 2x3 region of
-     * interest of within a flattened 2x4 array of quad-channel <em>RGBA</em> pixels (8 bits per channel per pixel).
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input samples
-     * // and output histograms
-     * int              num_row_pixels;     // e.g., 3
-     * int              num_rows;           // e.g., 2
-     * size_t           row_stride_bytes;   // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS
-     * unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -),
-     *                                      //        (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)]
-     * int*             d_histogram[3];     // e.g., three device pointers to three device buffers,
-     *                                      //       each allocated with 256 integer counters
-     * int              num_levels[3];      // e.g., {257, 257, 257};
-     * unsigned int     lower_level[3];     // e.g., {0, 0, 0};
-     * unsigned int     upper_level[3];     // e.g., {256, 256, 256};
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void*    d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
-     *     num_row_pixels, num_rows, row_stride_bytes);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histograms
-     * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
-     *     num_row_pixels, num_rows, row_stride_bytes);
-     *
-     * // d_histogram   <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0],
-     * //                     [0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0],
-     * //                     [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ]
-     *
-     * \endcode
-     *
-     * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
-     * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
-     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
-     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
-     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
-     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
-     */
-    template <
-        int                 NUM_CHANNELS,
-        int                 NUM_ACTIVE_CHANNELS,
-        typename            SampleIteratorT,
-        typename            CounterT,
-        typename            LevelT,
-        typename            OffsetT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t MultiHistogramEven(
-        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
-        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],           ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
-        int                 num_levels[NUM_ACTIVE_CHANNELS],            ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
-        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
-        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
-        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
-        size_t              row_stride_bytes,                           ///< [in] The number of bytes between starts of consecutive rows in the region of interest
-        cudaStream_t        stream                  = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous       = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        /// The sample value type of the input iterator
-        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-        Int2Type<sizeof(SampleT) == 1> is_byte_sample;
-
-        if ((sizeof(OffsetT) > sizeof(int)) &&
-            ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max()))
-        {
-            // Down-convert OffsetT data type
-
-
-            return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchEven(
-                d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
-                (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),
-                stream, debug_synchronous, is_byte_sample);
-        }
-
-        return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchEven(
-            d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
-            num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
-            stream, debug_synchronous, is_byte_sample);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Custom bin ranges
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels.
-     *
-     * \par
-     * - The number of histogram bins is (\p num_levels - 1)
-     * - The value range for bin<sub><em>i</em></sub> is [<tt>level[i]</tt>, <tt>level[i+1]</tt>)
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of an six-bin histogram
-     * from a sequence of float samples
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input samples and
-     * // output histogram
-     * int      num_samples;    // e.g., 10
-     * float*   d_samples;      // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5]
-     * int*     d_histogram;    // e.g., [ -, -, -, -, -, -, -, -]
-     * int      num_levels      // e.g., 7 (seven level boundaries for six bins)
-     * float*   d_levels;       // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void*    d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, d_levels, num_samples);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histograms
-     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, d_levels, num_samples);
-     *
-     * // d_histogram   <-- [1, 0, 5, 0, 3, 0, 0, 0];
-     *
-     * \endcode
-     *
-     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
-     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
-     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
-     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
-     */
-    template <
-        typename            SampleIteratorT,
-        typename            CounterT,
-        typename            LevelT,
-        typename            OffsetT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t HistogramRange(
-        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                              ///< [in] The pointer to the input sequence of data samples.
-        CounterT*           d_histogram,                            ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
-        int                 num_levels,                             ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
-        LevelT*             d_levels,                               ///< [in] The pointer to the array of boundaries (levels).  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
-        OffsetT             num_samples,                            ///< [in] The number of data samples per row in the region of interest
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        /// The sample value type of the input iterator
-        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-
-        CounterT*           d_histogram1[1] = {d_histogram};
-        int                 num_levels1[1]  = {num_levels};
-        LevelT*             d_levels1[1]    = {d_levels};
-
-        return MultiHistogramRange<1, 1>(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            d_histogram1,
-            num_levels1,
-            d_levels1,
-            num_samples,
-            1,
-            sizeof(SampleT) * num_samples,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels.
-     *
-     * \par
-     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
-     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
-     * - The row stride must be a whole multiple of the sample data type
-     *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
-     * - The number of histogram bins is (\p num_levels - 1)
-     * - The value range for bin<sub><em>i</em></sub> is [<tt>level[i]</tt>, <tt>level[i+1]</tt>)
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of a six-bin histogram
-     * from a 2x5 region of interest within a flattened 2x7 array of float samples.
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input samples and
-     * // output histogram
-     * int      num_row_samples;    // e.g., 5
-     * int      num_rows;           // e.g., 2;
-     * int      row_stride_bytes;   // e.g., 7 * sizeof(float)
-     * float*   d_samples;          // e.g., [2.2, 6.0, 7.1, 2.9, 3.5,   -, -,
-     *                              //        0.3, 2.9, 2.0, 6.1, 999.5, -, -]
-     * int*     d_histogram;        // e.g., [ , , , , , , , ]
-     * int      num_levels          // e.g., 7 (seven level boundaries for six bins)
-     * float    *d_levels;          // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void*    d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, d_levels,
-     *     num_row_samples, num_rows, row_stride_bytes);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histograms
-     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, d_levels,
-     *     num_row_samples, num_rows, row_stride_bytes);
-     *
-     * // d_histogram   <-- [1, 0, 5, 0, 3, 0, 0, 0];
-     *
-     * \endcode
-     *
-     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
-     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
-     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
-     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
-     */
-    template <
-        typename            SampleIteratorT,
-        typename            CounterT,
-        typename            LevelT,
-        typename            OffsetT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t HistogramRange(
-        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                              ///< [in] The pointer to the input sequence of data samples.
-        CounterT*           d_histogram,                            ///< [out] The pointer to the histogram counter output array of length <tt>num_levels</tt> - 1.
-        int                 num_levels,                             ///< [in] The number of boundaries (levels) for delineating histogram samples.  Implies that the number of bins is <tt>num_levels</tt> - 1.
-        LevelT*             d_levels,                               ///< [in] The pointer to the array of boundaries (levels).  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
-        OffsetT             num_row_samples,                        ///< [in] The number of data samples per row in the region of interest
-        OffsetT             num_rows,                               ///< [in] The number of rows in the region of interest
-        size_t              row_stride_bytes,                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        CounterT*           d_histogram1[1]     = {d_histogram};
-        int                 num_levels1[1]      = {num_levels};
-        LevelT*             d_levels1[1]        = {d_levels};
-
-        return MultiHistogramRange<1, 1>(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            d_histogram1,
-            num_levels1,
-            d_levels1,
-            num_row_samples,
-            num_rows,
-            row_stride_bytes,
-            stream,
-            debug_synchronous);
-    }
-
-    /**
-     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
-     *
-     * \par
-     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
-     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
-     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
-     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., <em>RGB</em> histograms from <em>RGBA</em>
-     *   pixel samples).
-     * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-     * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
-     *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of three 4-bin <em>RGB</em> histograms
-     * from a quad-channel sequence of <em>RGBA</em> pixels (8 bits per channel per pixel)
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input samples
-     * // and output histograms
-     * int            num_pixels;       // e.g., 5
-     * unsigned char  *d_samples;       // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2),
-     *                                  //        (0, 6, 7, 5),(3, 0, 2, 6)]
-     * unsigned int   *d_histogram[3];  // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]];
-     * int            num_levels[3];    // e.g., {5, 5, 5};
-     * unsigned int   *d_levels[3];     // e.g., [ [0, 2, 4, 6, 8],
-     *                                  //         [0, 2, 4, 6, 8],
-     *                                  //         [0, 2, 4, 6, 8] ];
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void*    d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, d_levels, num_pixels);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histograms
-     * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, d_levels, num_pixels);
-     *
-     * // d_histogram   <-- [ [1, 3, 0, 1],
-     * //                     [3, 0, 0, 2],
-     * //                     [0, 2, 0, 3] ]
-     *
-     * \endcode
-     *
-     * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
-     * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
-     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
-     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
-     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
-     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
-     */
-    template <
-        int                 NUM_CHANNELS,
-        int                 NUM_ACTIVE_CHANNELS,
-        typename            SampleIteratorT,
-        typename            CounterT,
-        typename            LevelT,
-        typename            OffsetT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t MultiHistogramRange(
-        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                              ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
-        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
-        int                 num_levels[NUM_ACTIVE_CHANNELS],        ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-        LevelT*             d_levels[NUM_ACTIVE_CHANNELS],          ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
-        OffsetT             num_pixels,                             ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        /// The sample value type of the input iterator
-        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-
-        return MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            d_histogram,
-            num_levels,
-            d_levels,
-            num_pixels,
-            1,
-            sizeof(SampleT) * NUM_CHANNELS * num_pixels,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels.
-     *
-     * \par
-     * - The input is a sequence of <em>pixel</em> structures, where each pixel comprises
-     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an <em>RGBA</em> pixel).
-     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
-     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., <em>RGB</em> histograms from <em>RGBA</em>
-     *   pixel samples).
-     * - A two-dimensional <em>region of interest</em> within \p d_samples can be specified
-     *   using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters.
-     * - The row stride must be a whole multiple of the sample data type
-     *   size, i.e., <tt>(row_stride_bytes % sizeof(SampleT)) == 0</tt>.
-     * - The number of histogram bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-     * - For channel<sub><em>i</em></sub>, the range of values for all histogram bins
-     *   have the same width: (<tt>upper_level[i]</tt> - <tt>lower_level[i]</tt>) / (<tt> num_levels[i]</tt> - 1)
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of three 4-bin <em>RGB</em> histograms from a 2x3 region of
-     * interest of within a flattened 2x4 array of quad-channel <em>RGBA</em> pixels (8 bits per channel per pixel).
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input samples
-     * // and output histograms
-     * int              num_row_pixels;     // e.g., 3
-     * int              num_rows;           // e.g., 2
-     * size_t           row_stride_bytes;   // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS
-     * unsigned char*   d_samples;          // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -),
-     *                                      //        (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)]
-     * int*             d_histogram[3];     // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]];
-     * int              num_levels[3];      // e.g., {5, 5, 5};
-     * unsigned int*    d_levels[3];        // e.g., [ [0, 2, 4, 6, 8],
-     *                                      //         [0, 2, 4, 6, 8],
-     *                                      //         [0, 2, 4, 6, 8] ];
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void*    d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histograms
-     * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes);
-     *
-     * // d_histogram   <-- [ [2, 3, 0, 1],
-     * //                     [3, 0, 0, 2],
-     * //                     [1, 2, 0, 3] ]
-     *
-     * \endcode
-     *
-     * \tparam NUM_CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
-     * \tparam NUM_ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
-     * \tparam SampleIteratorT          <b>[inferred]</b> Random-access input iterator type for reading input samples. \iterator
-     * \tparam CounterT                 <b>[inferred]</b> Integer type for histogram bin counters
-     * \tparam LevelT                   <b>[inferred]</b> Type for specifying boundaries (levels)
-     * \tparam OffsetT                  <b>[inferred]</b> Signed integer type for sequence offsets, list lengths, pointer differences, etc.  \offset_size1
-     */
-    template <
-        int                 NUM_CHANNELS,
-        int                 NUM_ACTIVE_CHANNELS,
-        typename            SampleIteratorT,
-        typename            CounterT,
-        typename            LevelT,
-        typename            OffsetT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t MultiHistogramRange(
-        void*               d_temp_storage,                         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                              ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four <em>RGBA</em> 8-bit samples).
-        CounterT*           d_histogram[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histogram[i]</tt> should be <tt>num_levels[i]</tt> - 1.
-        int                 num_levels[NUM_ACTIVE_CHANNELS],        ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_levels[i]</tt> - 1.
-        LevelT*             d_levels[NUM_ACTIVE_CHANNELS],          ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
-        OffsetT             num_row_pixels,                         ///< [in] The number of multi-channel pixels per row in the region of interest
-        OffsetT             num_rows,                               ///< [in] The number of rows in the region of interest
-        size_t              row_stride_bytes,                       ///< [in] The number of bytes between starts of consecutive rows in the region of interest
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        /// The sample value type of the input iterator
-        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-        Int2Type<sizeof(SampleT) == 1> is_byte_sample;
-
-        if ((sizeof(OffsetT) > sizeof(int)) &&
-            ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max()))
-        {
-            // Down-convert OffsetT data type
-            return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchRange(
-                d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
-                (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),
-                stream, debug_synchronous, is_byte_sample);
-        }
-
-        return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchRange(
-            d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
-            num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
-            stream, debug_synchronous, is_byte_sample);
-    }
-
-
-
-    //@}  end member group
-};
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/device_partition.cuh b/thrust/system/cuda/detail/cub/device/device_partition.cuh
deleted file mode 100644
index 178cfe938..000000000
--- a/thrust/system/cuda/detail/cub/device/device_partition.cuh
+++ /dev/null
@@ -1,273 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch/dispatch_select_if.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory. ![](partition_logo.png)
- * \ingroup SingleModule
- *
- * \par Overview
- * These operations apply a selection criterion to construct a partitioned output sequence from items selected/unselected from
- * a specified input sequence.
- *
- * \par Usage Considerations
- * \cdp_class{DevicePartition}
- *
- * \par Performance
- * \linear_performance{partition}
- *
- * \par
- * The following chart illustrates DevicePartition::If
- * performance across different CUDA architectures for \p int32 items,
- * where 50% of the items are randomly selected for the first partition.
- * \plots_below
- *
- * \image html partition_if_int32_50_percent.png
- *
- */
-struct DevicePartition
-{
-    /**
-     * \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out.  The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_flags_logo.png)
-     *
-     * \par
-     * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.).
-     * - Copies of the selected items are compacted into \p d_out and maintain their original
-     *   relative ordering, however copies of the unselected items are compacted into the
-     *   rear of \p d_out in reverse order.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_partition.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output
-     * int  num_items;              // e.g., 8
-     * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
-     * char *d_flags;               // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
-     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int  *d_num_selected_out;    // e.g., [ ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run selection
-     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
-     *
-     * // d_out                 <-- [1, 4, 6, 7, 8, 5, 3, 2]
-     * // d_num_selected_out    <-- [4]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam FlagIterator         <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
-     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
-     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    FlagIterator,
-        typename                    OutputIteratorT,
-        typename                    NumSelectedIteratorT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Flagged(
-        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
-        FlagIterator                d_flags,                        ///< [in] Pointer to the input sequence of selection flags
-        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of partitioned data items
-        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
-        int                         num_items,                      ///< [in] Total number of items to select from
-        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int                     OffsetT;         // Signed integer type for global offsets
-        typedef NullType                SelectOp;       // Selection op (not used)
-        typedef NullType                EqualityOp;     // Equality operator (not used)
-
-        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_flags,
-            d_out,
-            d_num_selected_out,
-            SelectOp(),
-            EqualityOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out.  The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_logo.png)
-     *
-     * \par
-     * - Copies of the selected items are compacted into \p d_out and maintain their original
-     *   relative ordering, however copies of the unselected items are compacted into the
-     *   rear of \p d_out in reverse order.
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated partition-if performance across different
-     * CUDA architectures for \p int32 and \p int64 items, respectively.  Items are
-     * selected for the first partition with 50% probability.
-     *
-     * \image html partition_if_int32_50_percent.png
-     * \image html partition_if_int64_50_percent.png
-     *
-     * \par
-     * The following charts are similar, but 5% selection probability for the first partition:
-     *
-     * \image html partition_if_int32_5_percent.png
-     * \image html partition_if_int64_5_percent.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_partition.cuh>
-     *
-     * // Functor type for selecting values less than some criteria
-     * struct LessThan
-     * {
-     *     int compare;
-     *
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     LessThan(int compare) : compare(compare) {}
-     *
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     bool operator()(const int &a) const {
-     *         return (a < compare);
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int      num_items;              // e.g., 8
-     * int      *d_in;                  // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
-     * int      *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int      *d_num_selected_out;    // e.g., [ ]
-     * LessThan select_op(7);
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run selection
-     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
-     *
-     * // d_out                 <-- [0, 2, 3, 5, 2, 8, 81, 9]
-     * // d_num_selected_out    <-- [5]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
-     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
-     * \tparam SelectOp             <b>[inferred]</b> Selection functor type having member <tt>bool operator()(const T &a)</tt>
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    OutputIteratorT,
-        typename                    NumSelectedIteratorT,
-        typename                    SelectOp>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t If(
-        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of partitioned data items
-        NumSelectedIteratorT        d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
-        int                         num_items,                      ///< [in] Total number of items to select from
-        SelectOp                    select_op,                      ///< [in] Unary selection operator
-        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int                     OffsetT;         // Signed integer type for global offsets
-        typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
-        typedef NullType                EqualityOp;     // Equality operator (not used)
-
-        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            NULL,
-            d_out,
-            d_num_selected_out,
-            select_op,
-            EqualityOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-};
-
-/**
- * \example example_device_partition_flagged.cu
- * \example example_device_partition_if.cu
- */
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh b/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh
deleted file mode 100644
index aead91103..000000000
--- a/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh
+++ /dev/null
@@ -1,797 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch/dispatch_radix_sort.cuh"
-#include "../util_arch.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. ![](sorting_logo.png)
- * \ingroup SingleModule
- *
- * \par Overview
- * The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
- * items into ascending (or descending) order.  The algorithm relies upon a positional representation for
- * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
- * characters, etc.) specified from least-significant to most-significant.  For a
- * given input sequence of keys and a set of rules specifying a total ordering
- * of the symbolic alphabet, the radix sorting method produces a lexicographic
- * ordering of those keys.
- *
- * \par
- * DeviceRadixSort can sort all of the built-in C++ numeric primitive types
- * (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half
- * half-precision floating-point type.  Although the direct radix sorting
- * method can only be applied to unsigned integral types, DeviceRadixSort
- * is able to sort signed and floating-point types via simple bit-wise transformations
- * that ensure lexicographic key ordering.
- *
- * \par Usage Considerations
- * \cdp_class{DeviceRadixSort}
- *
- * \par Performance
- * \linear_performance{radix sort} The following chart illustrates DeviceRadixSort::SortKeys
- * performance across different CUDA architectures for uniform-random \p uint32 keys.
- * \plots_below
- *
- * \image html lsb_radix_sort_int32_keys.png
- *
- */
-struct DeviceRadixSort
-{
-
-    /******************************************************************//**
-     * \name KeyT-value pairs
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Sorts key-value pairs into ascending order. (~<em>2N </em>auxiliary storage required)
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated sorting performance across different
-     * CUDA architectures for uniform-random <tt>uint32,uint32</tt> and
-     * <tt>uint64,uint64</tt> pairs, respectively.
-     *
-     * \image html lsb_radix_sort_int32_pairs.png
-     * \image html lsb_radix_sort_int64_pairs.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [        ...        ]
-     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_values_out;      // e.g., [        ...        ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
-     *
-     * // d_keys_out            <-- [0, 3, 5, 6, 7, 8, 9]
-     * // d_values_out          <-- [5, 4, 3, 1, 2, 0, 6]
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     * \tparam ValueT    <b>[inferred]</b> ValueT type
-     */
-    template <
-        typename            KeyT,
-        typename            ValueT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairs(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
-        const ValueT        *d_values_in,                           ///< [in] Pointer to the corresponding input sequence of associated value items
-        ValueT              *d_values_out,                          ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
-        int                 num_items,                              ///< [in] Number of items to sort
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
-
-        return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            false,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts key-value pairs into ascending order. (~<em>N </em>auxiliary storage required)
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers and a corresponding
-     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
-     *   structure that indicates which of the two buffers is "current" (and thus
-     *   contains the input data to be sorted).
-     * - The contents of both buffers within each pair may be altered by the sorting
-     *   operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within each DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated sorting performance across different
-     * CUDA architectures for uniform-random <tt>uint32,uint32</tt> and
-     * <tt>uint64,uint64</tt> pairs, respectively.
-     *
-     * \image html lsb_radix_sort_int32_pairs.png
-     * \image html lsb_radix_sort_int64_pairs.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [        ...        ]
-     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_value_alt_buf;   // e.g., [        ...        ]
-     * ...
-     *
-     * // Create a set of DoubleBuffers to wrap pairs of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
-     *
-     * // d_keys.Current()      <-- [0, 3, 5, 6, 7, 8, 9]
-     * // d_values.Current()    <-- [5, 4, 3, 1, 2, 0, 6]
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     * \tparam ValueT    <b>[inferred]</b> ValueT type
-     */
-    template <
-        typename            KeyT,
-        typename            ValueT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairs(
-        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-        int                     num_items,                              ///< [in] Number of items to sort
-        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            true,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Performance
-     * Performance is similar to DeviceRadixSort::SortPairs.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [        ...        ]
-     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_values_out;      // e.g., [        ...        ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out, num_items);
-     *
-     * // d_keys_out            <-- [9, 8, 7, 6, 5, 3, 0]
-     * // d_values_out          <-- [6, 0, 2, 1, 3, 4, 5]
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     * \tparam ValueT    <b>[inferred]</b> ValueT type
-     */
-    template <
-        typename            KeyT,
-        typename            ValueT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairsDescending(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
-        const ValueT        *d_values_in,                           ///< [in] Pointer to the corresponding input sequence of associated value items
-        ValueT              *d_values_out,                          ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items
-        int                 num_items,                              ///< [in] Number of items to sort
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
-
-        return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            false,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts key-value pairs into descending order. (~<em>N </em>auxiliary storage required).
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers and a corresponding
-     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
-     *   structure that indicates which of the two buffers is "current" (and thus
-     *   contains the input data to be sorted).
-     * - The contents of both buffers within each pair may be altered by the sorting
-     *   operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within each DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Performance
-     * Performance is similar to DeviceRadixSort::SortPairs.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [        ...        ]
-     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_value_alt_buf;   // e.g., [        ...        ]
-     * ...
-     *
-     * // Create a set of DoubleBuffers to wrap pairs of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
-     *
-     * // d_keys.Current()      <-- [9, 8, 7, 6, 5, 3, 0]
-     * // d_values.Current()    <-- [6, 0, 2, 1, 3, 4, 5]
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     * \tparam ValueT    <b>[inferred]</b> ValueT type
-     */
-    template <
-        typename            KeyT,
-        typename            ValueT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairsDescending(
-        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-        int                     num_items,                              ///< [in] Number of items to sort
-        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        return DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            true,
-            stream,
-            debug_synchronous);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Keys-only
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Sorts keys into ascending order. (~<em>2N </em>auxiliary storage required)
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated sorting performance across different
-     * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively.
-     *
-     * \image html lsb_radix_sort_int32_keys.png
-     * \image html lsb_radix_sort_int64_keys.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [        ...        ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
-     *
-     * // d_keys_out            <-- [0, 3, 5, 6, 7, 8, 9]
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     */
-    template <typename KeyT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeys(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
-        int                 num_items,                              ///< [in] Number of items to sort
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // Null value type
-        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<NullType>  d_values;
-
-        return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            false,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts keys into ascending order. (~<em>N </em>auxiliary storage required).
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers managed by a
-     *   DoubleBuffer structure that indicates which of the two buffers is
-     *   "current" (and thus contains the input data to be sorted).
-     * - The contents of both buffers may be altered by the sorting operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within the DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated sorting performance across different
-     * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively.
-     *
-     * \image html lsb_radix_sort_int32_keys.png
-     * \image html lsb_radix_sort_int64_keys.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [        ...        ]
-     * ...
-     *
-     * // Create a DoubleBuffer to wrap the pair of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
-     *
-     * // d_keys.Current()      <-- [0, 3, 5, 6, 7, 8, 9]
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     */
-    template <typename KeyT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeys(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        int                 num_items,                              ///< [in] Number of items to sort
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // Null value type
-        DoubleBuffer<NullType> d_values;
-
-        return DispatchRadixSort<false, KeyT, NullType, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            true,
-            stream,
-            debug_synchronous);
-    }
-
-    /**
-     * \brief Sorts keys into descending order. (~<em>2N</em> auxiliary storage required).
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Performance
-     * Performance is similar to DeviceRadixSort::SortKeys.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [        ...        ]
-     * ...
-     *
-     * // Create a DoubleBuffer to wrap the pair of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
-     *
-     * // d_keys_out            <-- [9, 8, 7, 6, 5, 3, 0]s
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     */
-    template <typename KeyT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeysDescending(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] Pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] Pointer to the sorted output sequence of key data
-        int                 num_items,                              ///< [in] Number of items to sort
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<NullType>  d_values;
-
-        return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            false,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts keys into descending order. (~<em>N </em>auxiliary storage required).
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers managed by a
-     *   DoubleBuffer structure that indicates which of the two buffers is
-     *   "current" (and thus contains the input data to be sorted).
-     * - The contents of both buffers may be altered by the sorting operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within the DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Performance
-     * Performance is similar to DeviceRadixSort::SortKeys.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [        ...        ]
-     * ...
-     *
-     * // Create a DoubleBuffer to wrap the pair of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items);
-     *
-     * // d_keys.Current()      <-- [9, 8, 7, 6, 5, 3, 0]
-     *
-     * \endcode
-     *
-     * \tparam KeyT      <b>[inferred]</b> KeyT type
-     */
-    template <typename KeyT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeysDescending(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        int                 num_items,                              ///< [in] Number of items to sort
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // Null value type
-        DoubleBuffer<NullType> d_values;
-
-        return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            true,
-            stream,
-            debug_synchronous);
-    }
-
-
-    //@}  end member group
-
-
-};
-
-/**
- * \example example_device_radix_sort.cu
- */
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/device_reduce.cuh b/thrust/system/cuda/detail/cub/device/device_reduce.cuh
deleted file mode 100644
index 43b91f799..000000000
--- a/thrust/system/cuda/detail/cub/device/device_reduce.cuh
+++ /dev/null
@@ -1,734 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-#include <limits>
-
-#include "../iterator/arg_index_input_iterator.cuh"
-#include "dispatch/dispatch_reduce.cuh"
-#include "dispatch/dispatch_reduce_by_key.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. ![](reduce_logo.png)
- * \ingroup SingleModule
- *
- * \par Overview
- * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
- * uses a binary combining operator to compute a single aggregate from a sequence of input elements.
- *
- * \par Usage Considerations
- * \cdp_class{DeviceReduce}
- *
- * \par Performance
- * \linear_performance{reduction, reduce-by-key, and run-length encode}
- *
- * \par
- * The following chart illustrates DeviceReduce::Sum
- * performance across different CUDA architectures for \p int32 keys.
- *
- * \image html reduce_int32.png
- *
- * \par
- * The following chart illustrates DeviceReduce::ReduceByKey (summation)
- * performance across different CUDA architectures for \p fp32
- * values.  Segments are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000].
- *
- * \image html reduce_by_key_fp32_len_500.png
- *
- * \par
- * \plots_below
- *
- */
-struct DeviceReduce
-{
-    /**
-     * \brief Computes a device-wide reduction using the specified binary \p reduction_op functor and initial value \p init.
-     *
-     * \par
-     * - Does not support binary reduction operators that are non-commutative.
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates a user-defined min-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // CustomMin functor
-     * struct CustomMin
-     * {
-     *     template <typename T>
-     *     __device__ __forceinline__
-     *     T operator()(const T &a, const T &b) const {
-     *         return (b < a) ? b : a;
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int          num_items;  // e.g., 7
-     * int          *d_in;      // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int          *d_out;     // e.g., [-]
-     * CustomMin    min_op;
-     * int          init;       // e.g., INT_MAX
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run reduction
-     * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init);
-     *
-     * // d_out <-- [0]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT      <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
-     * \tparam ReductionOpT         <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt> 
-     * \tparam T                    <b>[inferred]</b> Data element type that is convertible to the \p value type of \p InputIteratorT
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    OutputIteratorT,
-        typename                    ReductionOpT,
-        typename                    T>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t Reduce(
-        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
-        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
-        ReductionOpT                reduction_op,                       ///< [in] Binary reduction functor
-        T                           init,                               ///< [in] Initial value of the reduction
-        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            num_items,
-            reduction_op,
-            init,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide sum using the addition (\p +) operator.
-     *
-     * \par
-     * - Uses \p 0 as the initial value of the reduction.
-     * - Does not support \p + operators that are non-commutative..
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated sum-reduction performance across different
-     * CUDA architectures for \p int32 and \p int64 items, respectively.
-     *
-     * \image html reduce_int32.png
-     * \image html reduce_int64.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sum-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int  num_items;      // e.g., 7
-     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_out;         // e.g., [-]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sum-reduction
-     * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // d_out <-- [38]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    OutputIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t Sum(
-        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
-        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The output value type
-        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Sum>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            num_items,
-            cub::Sum(),
-            OutputT(),            // zero-initialize
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide minimum using the less-than ('<') operator.
-     *
-     * \par
-     * - Uses <tt>std::numeric_limits<T>::max()</tt> as the initial value of the reduction.
-     * - Does not support \p < operators that are non-commutative.
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the min-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int  num_items;      // e.g., 7
-     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_out;         // e.g., [-]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run min-reduction
-     * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // d_out <-- [0]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    OutputIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t Min(
-        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
-        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The input value type
-        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Min>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            num_items,
-            cub::Min(),
-            Traits<InputT>::Max(), // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item.
-     *
-     * \par
-     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
-     *   - The minimum is written to <tt>d_out.value</tt> and its offset in the input array is written to <tt>d_out.key</tt>.
-     *   - The <tt>{1, std::numeric_limits<T>::max()}</tt> tuple is produced for zero-length inputs
-     * - Does not support \p < operators that are non-commutative.
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int                      num_items;      // e.g., 7
-     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run argmin-reduction
-     * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
-     *
-     * // d_out <-- [{5, 0}]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>cub::KeyValuePair<int, T></tt>) \iterator
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    OutputIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t ArgMin(
-        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
-        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The input type
-        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
-
-        // The output tuple type
-        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT
-            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;                     // ... else the output iterator's value type
-
-        // The output value type
-        typedef typename OutputTupleT::Value OutputValueT;
-
-        // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
-        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
-        ArgIndexInputIteratorT d_indexed_in(d_in);
-
-        // Initial value
-        OutputTupleT initial_value(1, Traits<InputValueT>::Max());   // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
-
-        return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_indexed_in,
-            d_out,
-            num_items,
-            cub::ArgMin(),
-            initial_value,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide maximum using the greater-than ('>') operator.
-     *
-     * \par
-     * - Uses <tt>std::numeric_limits<T>::lowest()</tt> as the initial value of the reduction.
-     * - Does not support \p > operators that are non-commutative.
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the max-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int  num_items;      // e.g., 7
-     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_out;         // e.g., [-]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run max-reduction
-     * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
-     *
-     * // d_out <-- [9]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    OutputIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t Max(
-        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
-        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The input value type
-        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-        return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Max>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            num_items,
-            cub::Max(),
-            Traits<InputT>::Lowest(),    // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item
-     *
-     * \par
-     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
-     *   - The maximum is written to <tt>d_out.value</tt> and its offset in the input array is written to <tt>d_out.key</tt>.
-     *   - The <tt>{1, std::numeric_limits<T>::lowest()}</tt> tuple is produced for zero-length inputs
-     * - Does not support \p > operators that are non-commutative.
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int                      num_items;      // e.g., 7
-     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run argmax-reduction
-     * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
-     *
-     * // d_out <-- [{6, 9}]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>cub::KeyValuePair<int, T></tt>) \iterator
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    OutputIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t ArgMax(
-        void                        *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT             d_out,                              ///< [out] Pointer to the output aggregate
-        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The input type
-        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
-
-        // The output tuple type
-        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT
-            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;                     // ... else the output iterator's value type
-
-        // The output value type
-        typedef typename OutputTupleT::Value OutputValueT;
-
-        // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
-        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
-        ArgIndexInputIteratorT d_indexed_in(d_in);
-
-        // Initial value
-        OutputTupleT initial_value(1, Traits<InputValueT>::Lowest());     // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
-
-        return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_indexed_in,
-            d_out,
-            num_items,
-            cub::ArgMax(),
-            initial_value,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
-     *
-     * \par
-     * This operation computes segmented reductions within \p d_values_in using
-     * the specified binary \p reduction_op functor.  The segments are identified by
-     * "runs" of corresponding keys in \p d_keys_in, where runs are maximal ranges of
-     * consecutive, identical keys.  For the <em>i</em><sup>th</sup> run encountered,
-     * the first key of the run and the corresponding value aggregate of that run are
-     * written to <tt>d_unique_out[<em>i</em>]</tt> and <tt>d_aggregates_out[<em>i</em>]</tt>,
-     * respectively. The total number of runs encountered is written to \p d_num_runs_out.
-     *
-     * \par
-     * - The <tt>==</tt> equality operator is used to determine whether keys are equivalent
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following chart illustrates reduction-by-key (sum) performance across
-     * different CUDA architectures for \p fp32 and \p fp64 values, respectively.  Segments
-     * are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000].
-     *
-     * \image html reduce_by_key_fp32_len_500.png
-     * \image html reduce_by_key_fp64_len_500.png
-     *
-     * \par
-     * The following charts are similar, but with segment lengths uniformly sampled from [1,10]:
-     *
-     * \image html reduce_by_key_fp32_len_5.png
-     * \image html reduce_by_key_fp64_len_5.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the segmented reduction of \p int values grouped
-     * by runs of associated \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
-     *
-     * // CustomMin functor
-     * struct CustomMin
-     * {
-     *     template <typename T>
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     T operator()(const T &a, const T &b) const {
-     *         return (b < a) ? b : a;
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int          num_items;          // e.g., 8
-     * int          *d_keys_in;         // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
-     * int          *d_values_in;       // e.g., [0, 7, 1, 6, 2, 5, 3, 4]
-     * int          *d_unique_out;      // e.g., [-, -, -, -, -, -, -, -]
-     * int          *d_aggregates_out;  // e.g., [-, -, -, -, -, -, -, -]
-     * int          *d_num_runs_out;    // e.g., [-]
-     * CustomMin    reduction_op;
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run reduce-by-key
-     * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items);
-     *
-     * // d_unique_out      <-- [0, 2, 9, 5, 8]
-     * // d_aggregates_out  <-- [0, 1, 6, 2, 4]
-     * // d_num_runs_out    <-- [5]
-     *
-     * \endcode
-     *
-     * \tparam KeysInputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input keys \iterator
-     * \tparam UniqueOutputIteratorT    <b>[inferred]</b> Random-access output iterator type for writing unique output keys \iterator
-     * \tparam ValuesInputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input values \iterator
-     * \tparam AggregatesOutputIterator <b>[inferred]</b> Random-access output iterator type for writing output value aggregates \iterator
-     * \tparam NumRunsOutputIteratorT   <b>[inferred]</b> Output iterator type for recording the number of runs encountered \iterator
-     * \tparam ReductionOpT              <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt> 
-     */
-    template <
-        typename                    KeysInputIteratorT,
-        typename                    UniqueOutputIteratorT,
-        typename                    ValuesInputIteratorT,
-        typename                    AggregatesOutputIteratorT,
-        typename                    NumRunsOutputIteratorT,
-        typename                    ReductionOpT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t ReduceByKey(
-        void                        *d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        KeysInputIteratorT          d_keys_in,                      ///< [in] Pointer to the input sequence of keys
-        UniqueOutputIteratorT       d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
-        ValuesInputIteratorT        d_values_in,                    ///< [in] Pointer to the input sequence of corresponding values
-        AggregatesOutputIteratorT   d_aggregates_out,               ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
-        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
-        ReductionOpT                reduction_op,                   ///< [in] Binary reduction functor
-        int                         num_items,                      ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values)
-        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // FlagT iterator type (not used)
-
-        // Selection op (not used)
-
-        // Default == operator
-        typedef Equality EqualityOp;
-
-        return DispatchReduceByKey<KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, ReductionOpT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys_in,
-            d_unique_out,
-            d_values_in,
-            d_aggregates_out,
-            d_num_runs_out,
-            EqualityOp(),
-            reduction_op,
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-};
-
-/**
- * \example example_device_reduce.cu
- */
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh b/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh
deleted file mode 100644
index 236926c71..000000000
--- a/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh
+++ /dev/null
@@ -1,278 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceRunLengthEncode provides device-wide, parallel operations for computing a run-length encoding across a sequence of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch/dispatch_rle.cuh"
-#include "dispatch/dispatch_reduce_by_key.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceRunLengthEncode provides device-wide, parallel operations for demarcating "runs" of same-valued items within a sequence residing within device-accessible memory. ![](run_length_encode_logo.png)
- * \ingroup SingleModule
- *
- * \par Overview
- * A <a href="http://en.wikipedia.org/wiki/Run-length_encoding"><em>run-length encoding</em></a>
- * computes a simple compressed representation of a sequence of input elements such that each
- * maximal "run" of consecutive same-valued data items is encoded as a single data value along with a
- * count of the elements in that run.
- *
- * \par Usage Considerations
- * \cdp_class{DeviceRunLengthEncode}
- *
- * \par Performance
- * \linear_performance{run-length encode}
- *
- * \par
- * The following chart illustrates DeviceRunLengthEncode::RunLengthEncode performance across
- * different CUDA architectures for \p int32 items.
- * Segments have lengths uniformly sampled from [1,1000].
- *
- * \image html rle_int32_len_500.png
- *
- * \par
- * \plots_below
- *
- */
-struct DeviceRunLengthEncode
-{
-
-    /**
-     * \brief Computes a run-length encoding of the sequence \p d_in.
-     *
-     * \par
-     * - For the <em>i</em><sup>th</sup> run encountered, the first key of the run and its length are written to
-     *   <tt>d_unique_out[<em>i</em>]</tt> and <tt>d_counts_out[<em>i</em>]</tt>,
-     *   respectively.
-     * - The total number of runs encountered is written to \p d_num_runs_out.
-     * - The <tt>==</tt> equality operator is used to determine whether values are equivalent
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated encode performance across different
-     * CUDA architectures for \p int32 and \p int64 items, respectively.  Segments have
-     * lengths uniformly sampled from [1,1000].
-     *
-     * \image html rle_int32_len_500.png
-     * \image html rle_int64_len_500.png
-     *
-     * \par
-     * The following charts are similar, but with segment lengths uniformly sampled from [1,10]:
-     *
-     * \image html rle_int32_len_5.png
-     * \image html rle_int64_len_5.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the run-length encoding of a sequence of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_run_length_encode.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int          num_items;          // e.g., 8
-     * int          *d_in;              // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
-     * int          *d_unique_out;      // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int          *d_counts_out;      // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int          *d_num_runs_out;    // e.g., [ ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run encoding
-     * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
-     *
-     * // d_unique_out      <-- [0, 2, 9, 5, 8]
-     * // d_counts_out      <-- [1, 2, 1, 3, 1]
-     * // d_num_runs_out    <-- [5]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT           <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam UniqueOutputIteratorT    <b>[inferred]</b> Random-access output iterator type for writing unique output items \iterator
-     * \tparam LengthsOutputIteratorT   <b>[inferred]</b> Random-access output iterator type for writing output counts \iterator
-     * \tparam NumRunsOutputIteratorT   <b>[inferred]</b> Output iterator type for recording the number of runs encountered \iterator
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    UniqueOutputIteratorT,
-        typename                    LengthsOutputIteratorT,
-        typename                    NumRunsOutputIteratorT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Encode(
-        void*                       d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of keys
-        UniqueOutputIteratorT       d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
-        LengthsOutputIteratorT      d_counts_out,                   ///< [out] Pointer to the output sequence of run-lengths (one count per run)
-        NumRunsOutputIteratorT      d_num_runs_out,                     ///< [out] Pointer to total number of runs
-        int                         num_items,                      ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values)
-        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int         OffsetT;                    // Signed integer type for global offsets
-        typedef NullType*   FlagIterator;               // FlagT iterator type (not used)
-        typedef NullType    SelectOp;                   // Selection op (not used)
-        typedef Equality    EqualityOp;                 // Default == operator
-        typedef cub::Sum    ReductionOp;                // Value reduction operator
-
-        // The lengths output value type
-        typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),   // LengthT =  (if output iterator's value type is void) ?
-            OffsetT,                                                                                                    // ... then the OffsetT type,
-            typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                           // ... else the output iterator's value type
-
-        // Generator type for providing 1s values for run-length reduction
-        typedef ConstantInputIterator<LengthT, OffsetT> LengthsInputIteratorT;
-
-        return DispatchReduceByKey<InputIteratorT, UniqueOutputIteratorT, LengthsInputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, ReductionOp, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_unique_out,
-            LengthsInputIteratorT((LengthT) 1),
-            d_counts_out,
-            d_num_runs_out,
-            EqualityOp(),
-            ReductionOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Enumerates the starting offsets and lengths of all non-trivial runs (of length > 1) of same-valued keys in the sequence \p d_in.
-     *
-     * \par
-     * - For the <em>i</em><sup>th</sup> non-trivial run, the run's starting offset
-     *   and its length are written to <tt>d_offsets_out[<em>i</em>]</tt> and
-     *   <tt>d_lengths_out[<em>i</em>]</tt>, respectively.
-     * - The total number of runs encountered is written to \p d_num_runs_out.
-     * - The <tt>==</tt> equality operator is used to determine whether values are equivalent
-     * - \devicestorage
-     *
-     * \par Performance
-     *
-     * \par Snippet
-     * The code snippet below illustrates the identification of non-trivial runs within a sequence of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_run_length_encode.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int          num_items;          // e.g., 8
-     * int          *d_in;              // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
-     * int          *d_offsets_out;     // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int          *d_lengths_out;     // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int          *d_num_runs_out;    // e.g., [ ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run encoding
-     * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
-     *
-     * // d_offsets_out         <-- [1, 4]
-     * // d_lengths_out         <-- [2, 3]
-     * // d_num_runs_out        <-- [2]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT           <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OffsetsOutputIteratorT   <b>[inferred]</b> Random-access output iterator type for writing run-offset values \iterator
-     * \tparam LengthsOutputIteratorT   <b>[inferred]</b> Random-access output iterator type for writing run-length values \iterator
-     * \tparam NumRunsOutputIteratorT   <b>[inferred]</b> Output iterator type for recording the number of runs encountered \iterator
-     */
-    template <
-        typename                InputIteratorT,
-        typename                OffsetsOutputIteratorT,
-        typename                LengthsOutputIteratorT,
-        typename                NumRunsOutputIteratorT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t NonTrivialRuns(
-        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT          d_in,                           ///< [in] Pointer to input sequence of data items
-        OffsetsOutputIteratorT  d_offsets_out,                  ///< [out] Pointer to output sequence of run-offsets (one offset per non-trivial run)
-        LengthsOutputIteratorT  d_lengths_out,                  ///< [out] Pointer to output sequence of run-lengths (one count per non-trivial run)
-        NumRunsOutputIteratorT  d_num_runs_out,                 ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
-        int                     num_items,                      ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values)
-        cudaStream_t            stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int         OffsetT;                    // Signed integer type for global offsets
-        typedef Equality    EqualityOp;                 // Default == operator
-
-        return DeviceRleDispatch<InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_offsets_out,
-            d_lengths_out,
-            d_num_runs_out,
-            EqualityOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/device_scan.cuh b/thrust/system/cuda/detail/cub/device/device_scan.cuh
deleted file mode 100644
index 91827f230..000000000
--- a/thrust/system/cuda/detail/cub/device/device_scan.cuh
+++ /dev/null
@@ -1,443 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch/dispatch_scan.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. ![](device_scan.png)
- * \ingroup SingleModule
- *
- * \par Overview
- * Given a sequence of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
- * produces an output sequence where each element is computed to be the reduction
- * of the elements occurring earlier in the input sequence.  <em>Prefix sum</em>
- * connotes a prefix scan with the addition operator. The term \em inclusive indicates
- * that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
- * The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
- * the <em>i</em><sup>th</sup> output reduction.
- *
- * \par
- * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our <em>"decoupled look-back"</em> algorithm
- * for performing global prefix scan with only a single pass through the
- * input data, as described in our 2016 technical report [1].  The central
- * idea is to leverage a small, constant factor of redundant work in order to overlap the latencies
- * of global prefix propagation with local computation.  As such, our algorithm requires only
- * ~2<em>n</em> data movement (<em>n</em> inputs are read, <em>n</em> outputs are written), and typically
- * proceeds at "memcpy" speeds.
- *
- * \par
- * [1] [Duane Merrill and Michael Garland.  "Single-pass Parallel Prefix Scan with Decoupled Look-back", <em>NVIDIA Technical Report NVR-2016-002</em>, 2016.](https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back)
- *
- * \par Usage Considerations
- * \cdp_class{DeviceScan}
- *
- * \par Performance
- * \linear_performance{prefix scan}
- *
- * \par
- * The following chart illustrates DeviceScan::ExclusiveSum
- * performance across different CUDA architectures for \p int32 keys.
- * \plots_below
- *
- * \image html scan_int32.png
- *
- */
-struct DeviceScan
-{
-    /******************************************************************//**
-     * \name Exclusive scans
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Computes a device-wide exclusive prefix sum.  The value of 0 is applied as the initial value, and is assigned to *d_out.
-     *
-     * \par
-     * - Supports non-commutative sum operators.
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated exclusive sum performance across different
-     * CUDA architectures for \p int32 and \p int64 items, respectively.
-     *
-     * \image html scan_int32.png
-     * \image html scan_int64.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the exclusive prefix sum of an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int  num_items;      // e.g., 7
-     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run exclusive prefix sum
-     * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // d_out s<-- [0, 8, 14, 21, 26, 29, 29]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
-     */
-    template <
-        typename        InputIteratorT,
-        typename        OutputIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t ExclusiveSum(
-        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT d_out,                              ///< [out] Pointer to the output sequence of data items
-        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
-        cudaStream_t    stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool            debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The output value type
-        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-        // Initial value
-        OutputT init_value = 0;
-
-        return DispatchScan<InputIteratorT, OutputIteratorT, Sum, OutputT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            Sum(),
-            init_value,
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor.  The \p init_value value is applied as the initial value, and is assigned to *d_out.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the exclusive prefix min-scan of an \p int device vector
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
-     *
-     * // CustomMin functor
-     * struct CustomMin
-     * {
-     *     template <typename T>
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     T operator()(const T &a, const T &b) const {
-     *         return (b < a) ? b : a;
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int          num_items;      // e.g., 7
-     * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int          *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
-     * CustomMin    min_op
-     * ...
-     *
-     * // Determine temporary device storage requirements for exclusive prefix scan
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items);
-     *
-     * // Allocate temporary storage for exclusive prefix scan
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run exclusive prefix min-scan
-     * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items);
-     *
-     * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT   <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
-     * \tparam OutputIteratorT  <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
-     * \tparam ScanOp           <b>[inferred]</b> Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-     * \tparam Identity         <b>[inferred]</b> Type of the \p identity value used Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        typename        InputIteratorT,
-        typename        OutputIteratorT,
-        typename        ScanOpT,
-        typename        InitValueT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t ExclusiveScan(
-        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT d_out,                              ///< [out] Pointer to the output sequence of data items
-        ScanOpT         scan_op,                            ///< [in] Binary scan functor
-        InitValueT      init_value,                         ///< [in] Initial value to seed the exclusive scan (and is assigned to *d_out)
-        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
-        cudaStream_t    stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool            debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            scan_op,
-            init_value,
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Inclusive scans
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes a device-wide inclusive prefix sum.
-     *
-     * \par
-     * - Supports non-commutative sum operators.
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the inclusive prefix sum of an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int  num_items;      // e.g., 7
-     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
-     * ...
-     *
-     * // Determine temporary device storage requirements for inclusive prefix sum
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // Allocate temporary storage for inclusive prefix sum
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run inclusive prefix sum
-     * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // d_out <-- [8, 14, 21, 26, 29, 29, 38]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
-     */
-    template <
-        typename            InputIteratorT,
-        typename            OutputIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t InclusiveSum(
-        void*               d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT      d_in,                           ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT     d_out,                          ///< [out] Pointer to the output sequence of data items
-        int                 num_items,                      ///< [in] Total number of input items (i.e., the length of \p d_in)
-        cudaStream_t        stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        return DispatchScan<InputIteratorT, OutputIteratorT, Sum, NullType, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            Sum(),
-            NullType(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to a another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the inclusive prefix min-scan of an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
-     *
-     * // CustomMin functor
-     * struct CustomMin
-     * {
-     *     template <typename T>
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     T operator()(const T &a, const T &b) const {
-     *         return (b < a) ? b : a;
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int          num_items;      // e.g., 7
-     * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int          *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
-     * CustomMin    min_op;
-     * ...
-     *
-     * // Determine temporary device storage requirements for inclusive prefix scan
-     * void *d_temp_storage = NULL;
-     * size_t temp_storage_bytes = 0;
-     * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items);
-     *
-     * // Allocate temporary storage for inclusive prefix scan
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run inclusive prefix min-scan
-     * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items);
-     *
-     * // d_out <-- [8, 6, 6, 5, 3, 0, 0]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT   <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
-     * \tparam OutputIteratorT  <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
-     * \tparam ScanOp           <b>[inferred]</b> Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        typename        InputIteratorT,
-        typename        OutputIteratorT,
-        typename        ScanOpT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t InclusiveScan(
-        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT d_out,                              ///< [out] Pointer to the output sequence of data items
-        ScanOpT         scan_op,                            ///< [in] Binary scan functor
-        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
-        cudaStream_t    stream             = 0,             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool            debug_synchronous  = false)         ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            scan_op,
-            NullType(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-    //@}  end member group
-
-};
-
-/**
- * \example example_device_scan.cu
- */
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh b/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh
deleted file mode 100644
index dc019331e..000000000
--- a/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh
+++ /dev/null
@@ -1,876 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch/dispatch_radix_sort.cuh"
-#include "../util_arch.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory. ![](segmented_sorting_logo.png)
- * \ingroup SegmentedModule
- *
- * \par Overview
- * The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
- * items into ascending (or descending) order.  The algorithm relies upon a positional representation for
- * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
- * characters, etc.) specified from least-significant to most-significant.  For a
- * given input sequence of keys and a set of rules specifying a total ordering
- * of the symbolic alphabet, the radix sorting method produces a lexicographic
- * ordering of those keys.
- *
- * \par
- * DeviceSegmentedRadixSort can sort all of the built-in C++ numeric primitive types
- * (<tt>unsigned char</tt>, \p int, \p double, etc.) as well as CUDA's \p __half
- * half-precision floating-point type.  Although the direct radix sorting
- * method can only be applied to unsigned integral types, DeviceSegmentedRadixSort
- * is able to sort signed and floating-point types via simple bit-wise transformations
- * that ensure lexicographic key ordering.
- *
- * \par Usage Considerations
- * \cdp_class{DeviceSegmentedRadixSort}
- *
- */
-struct DeviceSegmentedRadixSort
-{
-
-    /******************************************************************//**
-     * \name Key-value pairs
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Sorts segments of key-value pairs into ascending order. (~<em>2N </em>auxiliary storage required)
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
-     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_values_out;      // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys_out            <-- [6, 7, 8, 0, 3, 5, 9]
-     * // d_values_out          <-- [1, 2, 0, 5, 4, 3, 6]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam ValueT           <b>[inferred]</b> Value type
-     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            KeyT,
-        typename            ValueT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairs(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
-        const ValueT        *d_values_in,                           ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items
-        ValueT              *d_values_out,                          ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
-        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
-
-        return DispatchSegmentedRadixSort<false, KeyT, ValueT, OffsetIteratorT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            false,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts segments of key-value pairs into ascending order. (~<em>N </em>auxiliary storage required)
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers and a corresponding
-     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
-     *   structure that indicates which of the two buffers is "current" (and thus
-     *   contains the input data to be sorted).
-     * - The contents of both buffers within each pair may be altered by the sorting
-     *   operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within each DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
-     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_value_alt_buf;   // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Create a set of DoubleBuffers to wrap pairs of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys.Current()      <-- [6, 7, 8, 0, 3, 5, 9]
-     * // d_values.Current()    <-- [5, 4, 3, 1, 2, 0, 6]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam ValueT           <b>[inferred]</b> Value type
-     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename                KeyT,
-        typename                ValueT,
-        typename                OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairs(
-        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-        int                     num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                     num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT         d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT         d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        return DispatchSegmentedRadixSort<false, KeyT, ValueT, OffsetIteratorT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            true,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts segments of key-value pairs into descending order. (~<em>2N</em> auxiliary storage required).
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
-     * int  *d_values_in;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_values_out;      // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
-     *     d_keys_in, d_keys_out, d_values_in, d_values_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys_out            <-- [8, 7, 6, 9, 5, 3, 0]
-     * // d_values_out          <-- [0, 2, 1, 6, 3, 4, 5]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam ValueT           <b>[inferred]</b> Value type
-     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            KeyT,
-        typename            ValueT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairsDescending(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
-        const ValueT        *d_values_in,                           ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items
-        ValueT              *d_values_out,                          ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
-        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        DoubleBuffer<KeyT>       d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<ValueT>     d_values(const_cast<ValueT*>(d_values_in), d_values_out);
-
-        return DispatchSegmentedRadixSort<true, KeyT, ValueT, OffsetIteratorT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            false,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts segments of key-value pairs into descending order. (~<em>N </em>auxiliary storage required).
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers and a corresponding
-     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
-     *   structure that indicates which of the two buffers is "current" (and thus
-     *   contains the input data to be sorted).
-     * - The contents of both buffers within each pair may be altered by the sorting
-     *   operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within each DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
-     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_value_alt_buf;   // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Create a set of DoubleBuffers to wrap pairs of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys.Current()      <-- [8, 7, 6, 9, 5, 3, 0]
-     * // d_values.Current()    <-- [0, 2, 1, 6, 3, 4, 5]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam ValueT           <b>[inferred]</b> Value type
-     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename                KeyT,
-        typename                ValueT,
-        typename                OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairsDescending(
-        void                    *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>      &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        DoubleBuffer<ValueT>    &d_values,                              ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-        int                     num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                     num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT         d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT         d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                     begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                     end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t            stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        return DispatchSegmentedRadixSort<true, KeyT, ValueT, OffsetIteratorT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            true,
-            stream,
-            debug_synchronous);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Keys-only
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Sorts segments of keys into ascending order. (~<em>2N </em>auxiliary storage required)
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys_out            <-- [6, 7, 8, 0, 3, 5, 9]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            KeyT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeys(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
-        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // Null value type
-        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<NullType>  d_values;
-
-        return DispatchSegmentedRadixSort<false, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            false,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts segments of keys into ascending order. (~<em>N </em>auxiliary storage required).
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers managed by a
-     *   DoubleBuffer structure that indicates which of the two buffers is
-     *   "current" (and thus contains the input data to be sorted).
-     * - The contents of both buffers may be altered by the sorting operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within the DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Create a DoubleBuffer to wrap the pair of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys.Current()      <-- [6, 7, 8, 0, 3, 5, 9]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            KeyT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeys(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // Null value type
-        DoubleBuffer<NullType> d_values;
-
-        return DispatchSegmentedRadixSort<false, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            true,
-            stream,
-            debug_synchronous);
-    }
-
-    /**
-     * \brief Sorts segments of keys into descending order. (~<em>2N</em> auxiliary storage required).
-     *
-     * \par
-     * - The contents of the input data are not altered by the sorting operation
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageNP  For sorting using only <em>O</em>(<tt>P</tt>) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_keys_in;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_keys_out;        // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Create a DoubleBuffer to wrap the pair of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys_out            <-- [8, 7, 6, 9, 5, 3, 0]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            KeyT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeysDescending(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT          *d_keys_in,                             ///< [in] %Device-accessible pointer to the input data of key data to sort
-        KeyT                *d_keys_out,                            ///< [out] %Device-accessible pointer to the sorted output sequence of key data
-        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        DoubleBuffer<KeyT>      d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<NullType>  d_values;
-
-        return DispatchSegmentedRadixSort<true, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            false,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts segments of keys into descending order. (~<em>N </em>auxiliary storage required).
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers managed by a
-     *   DoubleBuffer structure that indicates which of the two buffers is
-     *   "current" (and thus contains the input data to be sorted).
-     * - The contents of both buffers may be altered by the sorting operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within the DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - An optional bit subrange <tt>[begin_bit, end_bit)</tt> of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmentd_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  num_segments;       // e.g., 3
-     * int  *d_offsets;         // e.g., [0, 3, 3, 7]
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [-, -, -, -, -, -, -]
-     * ...
-     *
-     * // Create a DoubleBuffer to wrap the pair of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys,
-     *     num_items, num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_keys.Current()      <-- [8, 7, 6, 9, 5, 3, 0]
-     *
-     * \endcode
-     *
-     * \tparam KeyT             <b>[inferred]</b> Key type
-     * \tparam OffsetIteratorT  <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            KeyT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeysDescending(
-        void                *d_temp_storage,                        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>  &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        int                 num_items,                              ///< [in] The total number of items to sort (across all segments)
-        int                 num_segments,                           ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The least-significant bit index (inclusive)  needed for key comparison
-        int                 end_bit             = sizeof(KeyT) * 8, ///< [in] <b>[optional]</b> The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // Null value type
-        DoubleBuffer<NullType> d_values;
-
-        return DispatchSegmentedRadixSort<true, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            true,
-            stream,
-            debug_synchronous);
-    }
-
-
-    //@}  end member group
-
-
-};
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh b/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh
deleted file mode 100644
index 5626e0a00..000000000
--- a/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh
+++ /dev/null
@@ -1,619 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceSegmentedReduce provides device-wide, parallel operations for computing a batched reduction across multiple sequences of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "../iterator/arg_index_input_iterator.cuh"
-#include "dispatch/dispatch_reduce.cuh"
-#include "dispatch/dispatch_reduce_by_key.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across multiple sequences of data items residing within device-accessible memory. ![](reduce_logo.png)
- * \ingroup SegmentedModule
- *
- * \par Overview
- * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
- * uses a binary combining operator to compute a single aggregate from a sequence of input elements.
- *
- * \par Usage Considerations
- * \cdp_class{DeviceSegmentedReduce}
- *
- */
-struct DeviceSegmentedReduce
-{
-    /**
-     * \brief Computes a device-wide segmented reduction using the specified binary \p reduction_op functor.
-     *
-     * \par
-     * - Does not support binary reduction operators that are non-commutative.
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates a custom min-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // CustomMin functor
-     * struct CustomMin
-     * {
-     *     template <typename T>
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     T operator()(const T &a, const T &b) const {
-     *         return (b < a) ? b : a;
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int          num_segments;   // e.g., 3
-     * int          *d_offsets;     // e.g., [0, 3, 3, 7]
-     * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int          *d_out;         // e.g., [-, -, -]
-     * CustomMin    min_op;
-     * int          initial_value;           // e.g., INT_MAX
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1, min_op, initial_value);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run reduction
-     * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1, min_op, initial_value);
-     *
-     * // d_out <-- [6, INT_MAX, 0]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT      <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
-     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-     * \tparam T                    <b>[inferred]</b> Data element type that is convertible to the \p value type of \p InputIteratorT
-     */
-    template <
-        typename            InputIteratorT,
-        typename            OutputIteratorT,
-        typename            OffsetIteratorT,
-        typename            ReductionOp,
-        typename            T>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t Reduce(
-        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
-        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        ReductionOp         reduction_op,                       ///< [in] Binary reduction functor 
-        T                   initial_value,                      ///< [in] Initial value of the reduction for each segment
-        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, ReductionOp>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            reduction_op,
-            initial_value,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide segmented sum using the addition ('+') operator.
-     *
-     * \par
-     * - Uses \p 0 as the initial value of the reduction for each segment.
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - Does not support \p + operators that are non-commutative..
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sum reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int num_segments;   // e.g., 3
-     * int *d_offsets;     // e.g., [0, 3, 3, 7]
-     * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int *d_out;         // e.g., [-, -, -]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sum-reduction
-     * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_out <-- [21, 0, 17]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
-     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            InputIteratorT,
-        typename            OutputIteratorT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t Sum(
-        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
-        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The output value type
-        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-        return DispatchSegmentedReduce<InputIteratorT,  OutputIteratorT, OffsetIteratorT, OffsetT, cub::Sum>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            cub::Sum(),
-            OutputT(),            // zero-initialize
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide segmented minimum using the less-than ('<') operator.
-     *
-     * \par
-     * - Uses <tt>std::numeric_limits<T>::max()</tt> as the initial value of the reduction for each segment.
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - Does not support \p < operators that are non-commutative.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the min-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int num_segments;   // e.g., 3
-     * int *d_offsets;     // e.g., [0, 3, 3, 7]
-     * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int *d_out;         // e.g., [-, -, -]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run min-reduction
-     * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_out <-- [6, INT_MAX, 0]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
-     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            InputIteratorT,
-        typename            OutputIteratorT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t Min(
-        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
-        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The input value type
-        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-        return DispatchSegmentedReduce<InputIteratorT,  OutputIteratorT, OffsetIteratorT, OffsetT, cub::Min>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            cub::Min(),
-            Traits<InputT>::Max(),    // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item.
-     *
-     * \par
-     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
-     *   - The minimum of the <em>i</em><sup>th</sup> segment is written to <tt>d_out[i].value</tt> and its offset in that segment is written to <tt>d_out[i].key</tt>.
-     *   - The <tt>{1, std::numeric_limits<T>::max()}</tt> tuple is produced for zero-length inputs
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - Does not support \p < operators that are non-commutative.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int                      num_segments;   // e.g., 3
-     * int                      *d_offsets;     // e.g., [0, 3, 3, 7]
-     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}, {-,-}, {-,-}]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run argmin-reduction
-     * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>KeyValuePair<int, T></tt>) \iterator
-     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            InputIteratorT,
-        typename            OutputIteratorT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t ArgMin(
-        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
-        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The input type
-        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
-
-        // The output tuple type
-        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT
-            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;                     // ... else the output iterator's value type
-
-        // The output value type
-        typedef typename OutputTupleT::Value OutputValueT;
-
-        // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
-        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
-        ArgIndexInputIteratorT d_indexed_in(d_in);
-
-        // Initial value
-        OutputTupleT initial_value(1, Traits<InputValueT>::Max());   // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
-
-        return DispatchSegmentedReduce<ArgIndexInputIteratorT,  OutputIteratorT, OffsetIteratorT, OffsetT, cub::ArgMin>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_indexed_in,
-            d_out,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            cub::ArgMin(),
-            initial_value,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide segmented maximum using the greater-than ('>') operator.
-     *
-     * \par
-     * - Uses <tt>std::numeric_limits<T>::lowest()</tt> as the initial value of the reduction.
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - Does not support \p > operators that are non-commutative.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the max-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int num_segments;   // e.g., 3
-     * int *d_offsets;     // e.g., [0, 3, 3, 7]
-     * int *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int *d_out;         // e.g., [-, -, -]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run max-reduction
-     * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_out <-- [8, INT_MIN, 9]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
-     * \tparam OffsetIteratorT      <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            InputIteratorT,
-        typename            OutputIteratorT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t Max(
-        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
-        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The input value type
-        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
-
-        return DispatchSegmentedReduce<InputIteratorT,  OutputIteratorT, OffsetIteratorT, OffsetT, cub::Max>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            cub::Max(),
-            Traits<InputT>::Lowest(),    // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item
-     *
-     * \par
-     * - The output value type of \p d_out is cub::KeyValuePair <tt><int, T></tt> (assuming the value type of \p d_in is \p T)
-     *   - The maximum of the <em>i</em><sup>th</sup> segment is written to <tt>d_out[i].value</tt> and its offset in that segment is written to <tt>d_out[i].key</tt>.
-     *   - The <tt>{1, std::numeric_limits<T>::lowest()}</tt> tuple is produced for zero-length inputs
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length <tt>num_segments+1</tt>) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as <tt>segment_offsets+1</tt>).
-     * - Does not support \p > operators that are non-commutative.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int                      num_segments;   // e.g., 3
-     * int                      *d_offsets;     // e.g., [0, 3, 3, 7]
-     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}, {-,-}, {-,-}]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run argmax-reduction
-     * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out,
-     *     num_segments, d_offsets, d_offsets + 1);
-     *
-     * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
-     * \tparam OutputIteratorT    <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>KeyValuePair<int, T></tt>) \iterator
-     * \tparam OffsetIteratorT    <b>[inferred]</b> Random-access input iterator type for reading segment offsets \iterator
-     */
-    template <
-        typename            InputIteratorT,
-        typename            OutputIteratorT,
-        typename            OffsetIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t ArgMax(
-        void                *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT      d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT     d_out,                              ///< [out] Pointer to the output aggregate
-        int                 num_segments,                       ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT     d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT     d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The input type
-        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
-
-        // The output tuple type
-        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT
-            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;                     // ... else the output iterator's value type
-
-        // The output value type
-        typedef typename OutputTupleT::Value OutputValueT;
-
-        // Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
-        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
-        ArgIndexInputIteratorT d_indexed_in(d_in);
-
-        // Initial value
-        OutputTupleT initial_value(1, Traits<InputValueT>::Lowest());     // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
-
-        return DispatchSegmentedReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, cub::ArgMax>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_indexed_in,
-            d_out,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            cub::ArgMax(),
-            initial_value,
-            stream,
-            debug_synchronous);
-    }
-
-};
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/device_select.cuh b/thrust/system/cuda/detail/cub/device/device_select.cuh
deleted file mode 100644
index 3dc9d6ac3..000000000
--- a/thrust/system/cuda/detail/cub/device/device_select.cuh
+++ /dev/null
@@ -1,369 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch/dispatch_select_if.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. ![](select_logo.png)
- * \ingroup SingleModule
- *
- * \par Overview
- * These operations apply a selection criterion to selectively copy
- * items from a specified input sequence to a compact output sequence.
- *
- * \par Usage Considerations
- * \cdp_class{DeviceSelect}
- *
- * \par Performance
- * \linear_performance{select-flagged, select-if, and select-unique}
- *
- * \par
- * The following chart illustrates DeviceSelect::If
- * performance across different CUDA architectures for \p int32 items,
- * where 50% of the items are randomly selected.
- *
- * \image html select_if_int32_50_percent.png
- *
- * \par
- * The following chart illustrates DeviceSelect::Unique
- * performance across different CUDA architectures for \p int32 items
- * where segments have lengths uniformly sampled from [1,1000].
- *
- * \image html select_unique_int32_len_500.png
- *
- * \par
- * \plots_below
- *
- */
-struct DeviceSelect
-{
-    /**
-     * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](select_flags_logo.png)
-     *
-     * \par
-     * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.).
-     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_select.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output
-     * int  num_items;              // e.g., 8
-     * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
-     * char *d_flags;               // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
-     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int  *d_num_selected_out;    // e.g., [ ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run selection
-     * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
-     *
-     * // d_out                 <-- [1, 4, 6, 7]
-     * // d_num_selected_out    <-- [4]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam FlagIterator         <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
-     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
-     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    FlagIterator,
-        typename                    OutputIteratorT,
-        typename                    NumSelectedIteratorT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Flagged(
-        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
-        FlagIterator                d_flags,                        ///< [in] Pointer to the input sequence of selection flags
-        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
-        NumSelectedIteratorT         d_num_selected_out,                 ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
-        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int                     OffsetT;         // Signed integer type for global offsets
-        typedef NullType                SelectOp;       // Selection op (not used)
-        typedef NullType                EqualityOp;     // Equality operator (not used)
-
-        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_flags,
-            d_out,
-            d_num_selected_out,
-            SelectOp(),
-            EqualityOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](select_logo.png)
-     *
-     * \par
-     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated select-if performance across different
-     * CUDA architectures for \p int32 and \p int64 items, respectively.  Items are
-     * selected with 50% probability.
-     *
-     * \image html select_if_int32_50_percent.png
-     * \image html select_if_int64_50_percent.png
-     *
-     * \par
-     * The following charts are similar, but 5% selection probability:
-     *
-     * \image html select_if_int32_5_percent.png
-     * \image html select_if_int64_5_percent.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_select.cuh>
-     *
-     * // Functor type for selecting values less than some criteria
-     * struct LessThan
-     * {
-     *     int compare;
-     *
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     LessThan(int compare) : compare(compare) {}
-     *
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     bool operator()(const int &a) const {
-     *         return (a < compare);
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int      num_items;              // e.g., 8
-     * int      *d_in;                  // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
-     * int      *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int      *d_num_selected_out;    // e.g., [ ]
-     * LessThan select_op(7);
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run selection
-     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
-     *
-     * // d_out                 <-- [0, 2, 3, 5, 2]
-     * // d_num_selected_out    <-- [5]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
-     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
-     * \tparam SelectOp             <b>[inferred]</b> Selection operator type having member <tt>bool operator()(const T &a)</tt>
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    OutputIteratorT,
-        typename                    NumSelectedIteratorT,
-        typename                    SelectOp>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t If(
-        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
-        NumSelectedIteratorT         d_num_selected_out,                 ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
-        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        SelectOp                    select_op,                      ///< [in] Unary selection operator
-        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int                     OffsetT;         // Signed integer type for global offsets
-        typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
-        typedef NullType                EqualityOp;     // Equality operator (not used)
-
-        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            NULL,
-            d_out,
-            d_num_selected_out,
-            select_op,
-            EqualityOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.png)
-     *
-     * \par
-     * - The <tt>==</tt> equality operator is used to determine whether keys are equivalent
-     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated select-unique performance across different
-     * CUDA architectures for \p int32 and \p int64 items, respectively.  Segments have
-     * lengths uniformly sampled from [1,1000].
-     *
-     * \image html select_unique_int32_len_500.png
-     * \image html select_unique_int64_len_500.png
-     *
-     * \par
-     * The following charts are similar, but with segment lengths uniformly sampled from [1,10]:
-     *
-     * \image html select_unique_int32_len_5.png
-     * \image html select_unique_int64_len_5.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_select.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int  num_items;              // e.g., 8
-     * int  *d_in;                  // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
-     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int  *d_num_selected_out;    // e.g., [ ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run selection
-     * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items);
-     *
-     * // d_out                 <-- [0, 2, 9, 5, 8]
-     * // d_num_selected_out    <-- [5]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT       <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIteratorT      <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
-     * \tparam NumSelectedIteratorT  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
-     */
-    template <
-        typename                    InputIteratorT,
-        typename                    OutputIteratorT,
-        typename                    NumSelectedIteratorT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Unique(
-        void*               d_temp_storage,                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT             d_out,                          ///< [out] Pointer to the output sequence of selected data items
-        NumSelectedIteratorT         d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
-        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int                     OffsetT;         // Signed integer type for global offsets
-        typedef NullType*               FlagIterator;   // FlagT iterator type (not used)
-        typedef NullType                SelectOp;       // Selection op (not used)
-        typedef Equality                EqualityOp;     // Default == operator
-
-        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, false>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            NULL,
-            d_out,
-            d_num_selected_out,
-            SelectOp(),
-            EqualityOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-};
-
-/**
- * \example example_device_select_flagged.cu
- * \example example_device_select_if.cu
- * \example example_device_select_unique.cu
- */
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/device_spmv.cuh b/thrust/system/cuda/detail/cub/device/device_spmv.cuh
deleted file mode 100644
index 611d75d3a..000000000
--- a/thrust/system/cuda/detail/cub/device/device_spmv.cuh
+++ /dev/null
@@ -1,174 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV).
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-#include <limits>
-
-#include "dispatch/dispatch_spmv_orig.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * dense-vector multiplication (SpMV).
- * \ingroup SingleModule
- *
- * \par Overview
- * The [<em>SpMV computation</em>](http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication)
- * performs the matrix-vector operation
- * <em>y</em> = <em>alpha</em>*<b>A</b>*<em>x</em> + <em>beta</em>*<em>y</em>,
- * where:
- *  - <b>A</b> is an <em>m</em>x<em>n</em> sparse matrix whose non-zero structure is specified in
- *    [<em>compressed-storage-row (CSR) format</em>](http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_row_Storage_.28CRS_or_CSR.29)
- *    (i.e., three arrays: <em>values</em>, <em>row_offsets</em>, and <em>column_indices</em>)
- *  - <em>x</em> and <em>y</em> are dense vectors
- *  - <em>alpha</em> and <em>beta</em> are scalar multiplicands
- *
- * \par Usage Considerations
- * \cdp_class{DeviceSpmv}
- *
- */
-struct DeviceSpmv
-{
-    /******************************************************************//**
-     * \name CSR matrix operations
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief This function performs the matrix-vector operation <em>y</em> = <b>A</b>*<em>x</em>.
-     *
-     * \par Snippet
-     * The code snippet below illustrates SpMV upon a 9x9 CSR matrix <b>A</b>
-     * representing a 3x3 lattice (24 non-zeros).
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_spmv.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input vector x,
-     * // and output vector y
-     * int    num_rows = 9;
-     * int    num_cols = 9;
-     * int    num_nonzeros = 24;
-     *
-     * float* d_values;  // e.g., [1, 1, 1, 1, 1, 1, 1, 1,
-     *                   //        1, 1, 1, 1, 1, 1, 1, 1,
-     *                   //        1, 1, 1, 1, 1, 1, 1, 1]
-     *
-     * int*   d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0,
-     *                          //        4, 6, 1, 3, 5, 7, 2, 4,
-     *                          //        8, 3, 7, 4, 6, 8, 5, 7]
-     *
-     * int*   d_row_offsets;    // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24]
-     *
-     * float* d_vector_x;       // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1]
-     * float* d_vector_y;       // e.g., [ ,  ,  ,  ,  ,  ,  ,  ,  ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void*    d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
-     *     d_row_offsets, d_column_indices, d_vector_x, d_vector_y,
-     *     num_rows, num_cols, num_nonzeros, alpha, beta);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run SpMV
-     * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
-     *     d_row_offsets, d_column_indices, d_vector_x, d_vector_y,
-     *     num_rows, num_cols, num_nonzeros, alpha, beta);
-     *
-     * // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2]
-     *
-     * \endcode
-     *
-     * \tparam ValueT       <b>[inferred]</b> Matrix and vector value type (e.g., /p float, /p double, etc.)
-     */
-    template <
-        typename            ValueT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t CsrMV(
-        void*               d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        ValueT*             d_values,                           ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix <b>A</b>.
-        int*                d_row_offsets,                      ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros)
-        int*                d_column_indices,                   ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix <b>A</b>.  (Indices are zero-valued.)
-        ValueT*             d_vector_x,                         ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector <em>x</em>
-        ValueT*             d_vector_y,                         ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector <em>y</em>
-        int                 num_rows,                           ///< [in] number of rows of matrix <b>A</b>.
-        int                 num_cols,                           ///< [in] number of columns of matrix <b>A</b>.
-        int                 num_nonzeros,                       ///< [in] number of nonzero elements of matrix <b>A</b>.
-        cudaStream_t        stream                  = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous       = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        SpmvParams<ValueT, int> spmv_params;
-        spmv_params.d_values             = d_values;
-        spmv_params.d_row_end_offsets    = d_row_offsets + 1;
-        spmv_params.d_column_indices     = d_column_indices;
-        spmv_params.d_vector_x           = d_vector_x;
-        spmv_params.d_vector_y           = d_vector_y;
-        spmv_params.num_rows             = num_rows;
-        spmv_params.num_cols             = num_cols;
-        spmv_params.num_nonzeros         = num_nonzeros;
-        spmv_params.alpha                = 1.0;
-        spmv_params.beta                 = 0.0;
-
-        return DispatchSpmv<ValueT, int>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            spmv_params,
-            stream,
-            debug_synchronous);
-    }
-
-    //@}  end member group
-};
-
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh
deleted file mode 100644
index 4bf7d6f85..000000000
--- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh
+++ /dev/null
@@ -1,1096 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-#include <limits>
-
-#include "../../agent/agent_histogram.cuh"
-#include "../../util_debug.cuh"
-#include "../../util_device.cuh"
-#include "../../thread/thread_search.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-
-/******************************************************************************
- * Histogram kernel entry points
- *****************************************************************************/
-
-/**
- * Histogram initialization kernel entry point
- */
-template <
-    int                                             NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
-    typename                                        CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
-    typename                                        OffsetT>                        ///< Signed integer type for global offsets
-__global__ void DeviceHistogramInitKernel(
-    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>          num_output_bins_wrapper,        ///< Number of output histogram bins per channel
-    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>    d_output_histograms_wrapper,    ///< Histogram counter data having logical dimensions <tt>CounterT[NUM_ACTIVE_CHANNELS][num_bins.array[CHANNEL]]</tt>
-    GridQueue<int>                                  tile_queue)                     ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
-{
-    if ((threadIdx.x == 0) && (blockIdx.x == 0))
-        tile_queue.ResetDrain();
-
-    int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x;
-
-    #pragma unroll
-    for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-    {
-        if (output_bin < num_output_bins_wrapper.array[CHANNEL])
-            d_output_histograms_wrapper.array[CHANNEL][output_bin] = 0;
-    }
-}
-
-
-/**
- * Histogram privatized sweep kernel entry point (multi-block).  Computes privatized histograms, one per thread block.
- */
-template <
-    typename                                            AgentHistogramPolicyT,     ///< Parameterized AgentHistogramPolicy tuning policy type
-    int                                                 PRIVATIZED_SMEM_BINS,           ///< Maximum number of histogram bins per channel (e.g., up to 256)
-    int                                                 NUM_CHANNELS,                   ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
-    int                                                 NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
-    typename                                            SampleIteratorT,                ///< The input iterator type. \iterator.
-    typename                                            CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
-    typename                                            PrivatizedDecodeOpT,            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
-    typename                                            OutputDecodeOpT,                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
-    typename                                            OffsetT>                        ///< Signed integer type for global offsets
-__launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS))
-__global__ void DeviceHistogramSweepKernel(
-    SampleIteratorT                                         d_samples,                          ///< Input data to reduce
-    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                  num_output_bins_wrapper,            ///< The number bins per final output histogram
-    ArrayWrapper<int, NUM_ACTIVE_CHANNELS>                  num_privatized_bins_wrapper,        ///< The number bins per privatized histogram
-    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>            d_output_histograms_wrapper,        ///< Reference to final output histograms
-    ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS>            d_privatized_histograms_wrapper,    ///< Reference to privatized histograms
-    ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS>      output_decode_op_wrapper,           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
-    ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS>  privatized_decode_op_wrapper,       ///< The transform operator for determining privatized counter indices from samples, one for each channel
-    OffsetT                                                 num_row_pixels,                     ///< The number of multi-channel pixels per row in the region of interest
-    OffsetT                                                 num_rows,                           ///< The number of rows in the region of interest
-    OffsetT                                                 row_stride_samples,                 ///< The number of samples between starts of consecutive rows in the region of interest
-    int                                                     tiles_per_row,                      ///< Number of image tiles per row
-    GridQueue<int>                                          tile_queue)                         ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
-{
-    // Thread block type for compositing input tiles
-    typedef AgentHistogram<
-            AgentHistogramPolicyT,
-            PRIVATIZED_SMEM_BINS,
-            NUM_CHANNELS,
-            NUM_ACTIVE_CHANNELS,
-            SampleIteratorT,
-            CounterT,
-            PrivatizedDecodeOpT,
-            OutputDecodeOpT,
-            OffsetT>
-        AgentHistogramT;
-
-    // Shared memory for AgentHistogram
-    __shared__ typename AgentHistogramT::TempStorage temp_storage;
-
-    AgentHistogramT agent(
-        temp_storage,
-        d_samples,
-        num_output_bins_wrapper.array,
-        num_privatized_bins_wrapper.array,
-        d_output_histograms_wrapper.array,
-        d_privatized_histograms_wrapper.array,
-        output_decode_op_wrapper.array,
-        privatized_decode_op_wrapper.array);
-
-    // Initialize counters
-    agent.InitBinCounters();
-
-    // Consume input tiles
-    agent.ConsumeTiles(
-        num_row_pixels,
-        num_rows,
-        row_stride_samples,
-        tiles_per_row,
-        tile_queue);
-
-    // Store output to global (if necessary)
-    agent.StoreOutput();
-
-}
-
-
-
-
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram
- */
-template <
-    int         NUM_CHANNELS,               ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
-    int         NUM_ACTIVE_CHANNELS,        ///< Number of channels actively being histogrammed
-    typename    SampleIteratorT,            ///< Random-access input iterator type for reading input items \iterator
-    typename    CounterT,                   ///< Integer type for counting sample occurrences per histogram bin
-    typename    LevelT,                     ///< Type for specifying bin level boundaries
-    typename    OffsetT>                    ///< Signed integer type for global offsets
-struct DipatchHistogram
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// The sample value type of the input iterator
-    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-
-    enum
-    {
-        // Maximum number of bins per channel for which we will use a privatized smem strategy
-        MAX_PRIVATIZED_SMEM_BINS = 256
-    };
-
-
-    //---------------------------------------------------------------------
-    // Transform functors for converting samples to bin-ids
-    //---------------------------------------------------------------------
-
-    // Searches for bin given a list of bin-boundary levels
-    template <typename LevelIteratorT>
-    struct SearchTransform
-    {
-        LevelIteratorT  d_levels;                   // Pointer to levels array
-        int             num_output_levels;          // Number of levels in array
-
-        // Initializer
-        __host__ __device__ __forceinline__ void Init(
-            LevelIteratorT  d_levels,               // Pointer to levels array
-            int             num_output_levels)      // Number of levels in array
-        {
-            this->d_levels          = d_levels;
-            this->num_output_levels = num_output_levels;
-        }
-
-        // Method for converting samples to bin-ids
-        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
-        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
-        {
-            /// Level iterator wrapper type
-            typedef typename If<IsPointer<LevelIteratorT>::VALUE,
-                    CacheModifiedInputIterator<LOAD_MODIFIER, LevelT, OffsetT>,     // Wrap the native input pointer with CacheModifiedInputIterator
-                    LevelIteratorT>::Type                                           // Directly use the supplied input iterator type
-                WrappedLevelIteratorT;
-
-            WrappedLevelIteratorT wrapped_levels(d_levels);
-
-            int num_bins = num_output_levels - 1;
-            if (valid)
-            {
-                bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1;
-                if (bin >= num_bins)
-                    bin = -1;
-            }
-        }
-    };
-
-
-    // Scales samples to evenly-spaced bins
-    struct ScaleTransform
-    {
-        int    num_bins;    // Number of levels in array
-        LevelT max;         // Max sample level (exclusive)
-        LevelT min;         // Min sample level (inclusive)
-        LevelT scale;       // Bin scaling factor
-
-        // Initializer
-        template <typename _LevelT>
-        __host__ __device__ __forceinline__ void Init(
-            int     num_output_levels,  // Number of levels in array
-            _LevelT max,                // Max sample level (exclusive)
-            _LevelT min,                // Min sample level (inclusive)
-            _LevelT scale)              // Bin scaling factor
-        {
-            this->num_bins = num_output_levels - 1;
-            this->max = max;
-            this->min = min;
-            this->scale = scale;
-        }
-
-        // Initializer (float specialization)
-        __host__ __device__ __forceinline__ void Init(
-            int    num_output_levels,   // Number of levels in array
-            float   max,                // Max sample level (exclusive)
-            float   min,                // Min sample level (inclusive)
-            float   scale)              // Bin scaling factor
-        {
-            this->num_bins = num_output_levels - 1;
-            this->max = max;
-            this->min = min;
-            this->scale = float(1.0) / scale;
-        }
-
-        // Initializer (double specialization)
-        __host__ __device__ __forceinline__ void Init(
-            int    num_output_levels,   // Number of levels in array
-            double max,                 // Max sample level (exclusive)
-            double min,                 // Min sample level (inclusive)
-            double scale)               // Bin scaling factor
-        {
-            this->num_bins = num_output_levels - 1;
-            this->max = max;
-            this->min = min;
-            this->scale = double(1.0) / scale;
-        }
-
-        // Method for converting samples to bin-ids
-        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
-        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
-        {
-            LevelT level_sample = (LevelT) sample;
-
-            if (valid && (level_sample >= min) && (level_sample < max))
-                bin = (int) ((level_sample - min) / scale);
-        }
-
-        // Method for converting samples to bin-ids (float specialization)
-        template <CacheLoadModifier LOAD_MODIFIER>
-        __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid)
-        {
-            LevelT level_sample = (LevelT) sample;
-
-            if (valid && (level_sample >= min) && (level_sample < max))
-                bin = (int) ((level_sample - min) * scale);
-        }
-
-        // Method for converting samples to bin-ids (double specialization)
-        template <CacheLoadModifier LOAD_MODIFIER>
-        __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid)
-        {
-            LevelT level_sample = (LevelT) sample;
-
-            if (valid && (level_sample >= min) && (level_sample < max))
-                bin = (int) ((level_sample - min) * scale);
-        }
-    };
-
-
-    // Pass-through bin transform operator
-    struct PassThruTransform
-    {
-        // Method for converting samples to bin-ids
-        template <CacheLoadModifier LOAD_MODIFIER, typename _SampleT>
-        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
-        {
-            if (valid)
-                bin = (int) sample;
-        }
-    };
-
-
-
-    //---------------------------------------------------------------------
-    // Tuning policies
-    //---------------------------------------------------------------------
-
-    template <int NOMINAL_ITEMS_PER_THREAD>
-    struct TScale
-    {
-        enum
-        {
-            V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int),
-            VALUE   = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1)
-        };
-    };
-
-
-    /// SM11
-    struct Policy110
-    {
-        // HistogramSweepPolicy
-        typedef AgentHistogramPolicy<
-                512,
-                (NUM_CHANNELS == 1) ? 8 : 2,
-                BLOCK_LOAD_DIRECT,
-                LOAD_DEFAULT,
-                true,
-                GMEM,
-                false>
-            HistogramSweepPolicy;
-    };
-
-    /// SM20
-    struct Policy200
-    {
-        // HistogramSweepPolicy
-        typedef AgentHistogramPolicy<
-                (NUM_CHANNELS == 1) ? 256 : 128,
-                (NUM_CHANNELS == 1) ? 8 : 3,
-                (NUM_CHANNELS == 1) ? BLOCK_LOAD_DIRECT : BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                true,
-                SMEM,
-                false>
-            HistogramSweepPolicy;
-    };
-
-    /// SM30
-    struct Policy300
-    {
-        // HistogramSweepPolicy
-        typedef AgentHistogramPolicy<
-                512,
-                (NUM_CHANNELS == 1) ? 8 : 2,
-                BLOCK_LOAD_DIRECT,
-                LOAD_DEFAULT,
-                true,
-                GMEM,
-                false>
-            HistogramSweepPolicy;
-    };
-
-    /// SM35
-    struct Policy350
-    {
-        // HistogramSweepPolicy
-        typedef AgentHistogramPolicy<
-                128,
-                TScale<8>::VALUE,
-                BLOCK_LOAD_DIRECT,
-                LOAD_LDG,
-                true,
-                BLEND,
-                true>
-            HistogramSweepPolicy;
-    };
-
-    /// SM50
-    struct Policy500
-    {
-        // HistogramSweepPolicy
-        typedef AgentHistogramPolicy<
-                384,
-                TScale<16>::VALUE,
-                BLOCK_LOAD_DIRECT,
-                LOAD_LDG,
-                true,
-                SMEM,
-                false>
-            HistogramSweepPolicy;
-    };
-
-
-
-    //---------------------------------------------------------------------
-    // Tuning policies of current PTX compiler pass
-    //---------------------------------------------------------------------
-
-#if (CUB_PTX_ARCH >= 500)
-    typedef Policy500 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#else
-    typedef Policy110 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {};
-
-
-    //---------------------------------------------------------------------
-    // Utilities
-    //---------------------------------------------------------------------
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <typename KernelConfig>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t InitConfigs(
-        int             ptx_version,
-        KernelConfig    &histogram_sweep_config)
-    {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        return histogram_sweep_config.template Init<PtxHistogramSweepPolicy>();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 500)
-        {
-            return histogram_sweep_config.template Init<typename Policy500::HistogramSweepPolicy>();
-        }
-        else if (ptx_version >= 350)
-        {
-            return histogram_sweep_config.template Init<typename Policy350::HistogramSweepPolicy>();
-        }
-        else if (ptx_version >= 300)
-        {
-            return histogram_sweep_config.template Init<typename Policy300::HistogramSweepPolicy>();
-        }
-        else if (ptx_version >= 200)
-        {
-            return histogram_sweep_config.template Init<typename Policy200::HistogramSweepPolicy>();
-        }
-        else if (ptx_version >= 110)
-        {
-            return histogram_sweep_config.template Init<typename Policy110::HistogramSweepPolicy>();
-        }
-        else
-        {
-            // No global atomic support
-            return cudaErrorNotSupported;
-        }
-
-    #endif
-    }
-
-
-    /**
-     * Kernel kernel dispatch configuration
-     */
-    struct KernelConfig
-    {
-        int                             block_threads;
-        int                             pixels_per_thread;
-
-        template <typename BlockPolicy>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        cudaError_t Init()
-        {
-            block_threads               = BlockPolicy::BLOCK_THREADS;
-            pixels_per_thread           = BlockPolicy::PIXELS_PER_THREAD;
-
-            return cudaSuccess;
-        }
-    };
-
-
-    //---------------------------------------------------------------------
-    // Dispatch entrypoints
-    //---------------------------------------------------------------------
-
-    /**
-     * Privatization-based dispatch routine
-     */
-    template <
-        typename                            PrivatizedDecodeOpT,                            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
-        typename                            OutputDecodeOpT,                                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
-        typename                            DeviceHistogramInitKernelT,                     ///< Function type of cub::DeviceHistogramInitKernel
-        typename                            DeviceHistogramSweepKernelT>                    ///< Function type of cub::DeviceHistogramSweepKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t PrivatizedDispatch(
-        void*                               d_temp_storage,                                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&                             temp_storage_bytes,                             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT                     d_samples,                                      ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
-        CounterT*                           d_output_histograms[NUM_ACTIVE_CHANNELS],       ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
-        int                                 num_privatized_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
-        PrivatizedDecodeOpT                 privatized_decode_op[NUM_ACTIVE_CHANNELS],      ///< [in] Transform operators for determining bin-ids from samples, one for each channel
-        int                                 num_output_levels[NUM_ACTIVE_CHANNELS],         ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
-        OutputDecodeOpT                     output_decode_op[NUM_ACTIVE_CHANNELS],          ///< [in] Transform operators for determining bin-ids from samples, one for each channel
-        int                                 max_num_output_bins,                            ///< [in] Maximum number of output bins in any channel
-        OffsetT                             num_row_pixels,                                 ///< [in] The number of multi-channel pixels per row in the region of interest
-        OffsetT                             num_rows,                                       ///< [in] The number of rows in the region of interest
-        OffsetT                             row_stride_samples,                             ///< [in] The number of samples between starts of consecutive rows in the region of interest
-        DeviceHistogramInitKernelT          histogram_init_kernel,                          ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel
-        DeviceHistogramSweepKernelT         histogram_sweep_kernel,                         ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel
-        KernelConfig                        histogram_sweep_config,                         ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for
-        cudaStream_t                        stream,                                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                                debug_synchronous)                              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-    #ifndef CUB_RUNTIME_ENABLED
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-    #else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Get SM occupancy for histogram_sweep_kernel
-            int histogram_sweep_sm_occupancy;
-            if (CubDebug(error = MaxSmOccupancy(
-                histogram_sweep_sm_occupancy,
-                histogram_sweep_kernel,
-                histogram_sweep_config.block_threads))) break;
-
-            // Get device occupancy for histogram_sweep_kernel
-            int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count;
-
-            if (num_row_pixels * NUM_CHANNELS == row_stride_samples)
-            {
-                // Treat as a single linear array of samples
-                num_row_pixels      *= num_rows;
-                num_rows            = 1;
-                row_stride_samples  = num_row_pixels * NUM_CHANNELS;
-            }
-
-            // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy
-            int pixels_per_tile     = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread;
-            int tiles_per_row       = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile;
-            int blocks_per_row      = CUB_MIN(histogram_sweep_occupancy, tiles_per_row);
-            int blocks_per_col      = (blocks_per_row > 0) ?
-                                        int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) :
-                                        0;
-            int num_thread_blocks   = blocks_per_row * blocks_per_col;
-
-            dim3 sweep_grid_dims;
-            sweep_grid_dims.x = (unsigned int) blocks_per_row;
-            sweep_grid_dims.y = (unsigned int) blocks_per_col;
-            sweep_grid_dims.z = 1;
-
-            // Temporary storage allocation requirements
-            const int   NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1;
-            void*       allocations[NUM_ALLOCATIONS];
-            size_t      allocation_sizes[NUM_ALLOCATIONS];
-
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-                allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT);
-
-            allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue<int>::AllocationSize();
-
-            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                break;
-            }
-
-            // Construct the grid queue descriptor
-            GridQueue<int> tile_queue(allocations[NUM_ALLOCATIONS - 1]);
-
-            // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
-            ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper;
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-                d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL];
-
-            // Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters)
-            ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_privatized_histograms_wrapper;
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-                d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL];
-
-            // Setup array wrapper for sweep bin transforms (because we can't pass static arrays as kernel parameters)
-            ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS> privatized_decode_op_wrapper;
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-                privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL];
-
-            // Setup array wrapper for aggregation bin transforms (because we can't pass static arrays as kernel parameters)
-            ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS> output_decode_op_wrapper;
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-                output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL];
-
-            // Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters)
-            ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper;
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-                num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1;
-
-            // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters)
-            ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper;
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-                num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1;
-
-            int histogram_init_block_threads    = 256;
-            int histogram_init_grid_dims        = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;
-
-            // Log DeviceHistogramInitKernel configuration
-            if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
-                histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);
-
-            // Invoke histogram_init_kernel
-            histogram_init_kernel<<<histogram_init_grid_dims, histogram_init_block_threads, 0, stream>>>(
-                num_output_bins_wrapper,
-                d_output_histograms_wrapper,
-                tile_queue);
-
-            // Return if empty problem
-            if ((blocks_per_row == 0) || (blocks_per_col == 0))
-                break;
-
-            // Log histogram_sweep_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n",
-                sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z,
-                histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);
-
-            // Invoke histogram_sweep_kernel
-            histogram_sweep_kernel<<<sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream>>>(
-                d_samples,
-                num_output_bins_wrapper,
-                num_privatized_bins_wrapper,
-                d_output_histograms_wrapper,
-                d_privatized_histograms_wrapper,
-                output_decode_op_wrapper,
-                privatized_decode_op_wrapper,
-                num_row_pixels,
-                num_rows,
-                row_stride_samples,
-                tiles_per_row,
-                tile_queue);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-        }
-        while (0);
-
-        return error;
-
-    #endif // CUB_RUNTIME_ENABLED
-    }
-
-
-
-    /**
-     * Dispatch routine for HistogramRange, specialized for sample types larger than 8bit
-     */
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t DispatchRange(
-        void*               d_temp_storage,                                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
-        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],      ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
-        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
-        LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
-        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
-        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
-        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
-        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-        Int2Type<false>     is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is a 8b type
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel dispatch configurations
-            KernelConfig histogram_sweep_config;
-            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
-                break;
-
-            // Use the search transform op for converting samples to privatized bins
-            typedef SearchTransform<LevelT*> PrivatizedDecodeOpT;
-
-            // Use the pass-thru transform op for converting privatized bins to output bins
-            typedef PassThruTransform OutputDecodeOpT;
-
-            PrivatizedDecodeOpT     privatized_decode_op[NUM_ACTIVE_CHANNELS];
-            OutputDecodeOpT         output_decode_op[NUM_ACTIVE_CHANNELS];
-            int                     max_levels = num_output_levels[0];
-
-            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-            {
-                privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
-                if (num_output_levels[channel] > max_levels)
-                    max_levels = num_output_levels[channel];
-            }
-            int max_num_output_bins = max_levels - 1;
-
-            // Dispatch
-            if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
-            {
-                // Too many bins to keep in shared memory.
-                const int PRIVATIZED_SMEM_BINS = 0;
-
-                if (CubDebug(error = PrivatizedDispatch(
-                    d_temp_storage,
-                    temp_storage_bytes,
-                    d_samples,
-                    d_output_histograms,
-                    num_output_levels,
-                    privatized_decode_op,
-                    num_output_levels,
-                    output_decode_op,
-                    max_num_output_bins,
-                    num_row_pixels,
-                    num_rows,
-                    row_stride_samples,
-                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
-                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
-                    histogram_sweep_config,
-                    stream,
-                    debug_synchronous))) break;
-            }
-            else
-            {
-                // Dispatch shared-privatized approach
-                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
-
-                if (CubDebug(error = PrivatizedDispatch(
-                    d_temp_storage,
-                    temp_storage_bytes,
-                    d_samples,
-                    d_output_histograms,
-                    num_output_levels,
-                    privatized_decode_op,
-                    num_output_levels,
-                    output_decode_op,
-                    max_num_output_bins,
-                    num_row_pixels,
-                    num_rows,
-                    row_stride_samples,
-                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
-                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
-                    histogram_sweep_config,
-                    stream,
-                    debug_synchronous))) break;
-            }
-
-        } while (0);
-
-        return error;
-    }
-
-
-    /**
-     * Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
-     */
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t DispatchRange(
-        void*               d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
-        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
-        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
-        LevelT              *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel.  Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
-        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
-        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
-        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
-        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-        Int2Type<true>      is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is a 8b type
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel dispatch configurations
-            KernelConfig histogram_sweep_config;
-            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
-                break;
-
-            // Use the pass-thru transform op for converting samples to privatized bins
-            typedef PassThruTransform PrivatizedDecodeOpT;
-
-            // Use the search transform op for converting privatized bins to output bins
-            typedef SearchTransform<LevelT*> OutputDecodeOpT;
-
-            int                         num_privatized_levels[NUM_ACTIVE_CHANNELS];
-            PrivatizedDecodeOpT         privatized_decode_op[NUM_ACTIVE_CHANNELS];
-            OutputDecodeOpT             output_decode_op[NUM_ACTIVE_CHANNELS];
-            int                         max_levels = num_output_levels[0];              // Maximum number of levels in any channel
-
-            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-            {
-                num_privatized_levels[channel] = 257;
-                output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
-
-                if (num_output_levels[channel] > max_levels)
-                    max_levels = num_output_levels[channel];
-            }
-            int max_num_output_bins = max_levels - 1;
-
-            const int PRIVATIZED_SMEM_BINS = 256;
-
-            if (CubDebug(error = PrivatizedDispatch(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_samples,
-                d_output_histograms,
-                num_privatized_levels,
-                privatized_decode_op,
-                num_output_levels,
-                output_decode_op,
-                max_num_output_bins,
-                num_row_pixels,
-                num_rows,
-                row_stride_samples,
-                DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
-                DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
-                histogram_sweep_config,
-                stream,
-                debug_synchronous))) break;
-
-        } while (0);
-
-        return error;
-    }
-
-
-    /**
-     * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t DispatchEven(
-        void*               d_temp_storage,                            ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
-        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],  ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
-        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
-        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
-        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
-        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
-        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
-        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
-        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-        Int2Type<false>     is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is a 8b type
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel dispatch configurations
-            KernelConfig histogram_sweep_config;
-            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
-                break;
-
-            // Use the scale transform op for converting samples to privatized bins
-            typedef ScaleTransform PrivatizedDecodeOpT;
-
-            // Use the pass-thru transform op for converting privatized bins to output bins
-            typedef PassThruTransform OutputDecodeOpT;
-
-            PrivatizedDecodeOpT         privatized_decode_op[NUM_ACTIVE_CHANNELS];
-            OutputDecodeOpT             output_decode_op[NUM_ACTIVE_CHANNELS];
-            int                         max_levels = num_output_levels[0];
-
-            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-            {
-                int     bins    = num_output_levels[channel] - 1;
-                LevelT  scale   = (upper_level[channel] - lower_level[channel]) / bins;
-
-                privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
-
-                if (num_output_levels[channel] > max_levels)
-                    max_levels = num_output_levels[channel];
-            }
-            int max_num_output_bins = max_levels - 1;
-
-            if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
-            {
-                // Dispatch shared-privatized approach
-                const int PRIVATIZED_SMEM_BINS = 0;
-
-                if (CubDebug(error = PrivatizedDispatch(
-                    d_temp_storage,
-                    temp_storage_bytes,
-                    d_samples,
-                    d_output_histograms,
-                    num_output_levels,
-                    privatized_decode_op,
-                    num_output_levels,
-                    output_decode_op,
-                    max_num_output_bins,
-                    num_row_pixels,
-                    num_rows,
-                    row_stride_samples,
-                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
-                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
-                    histogram_sweep_config,
-                    stream,
-                    debug_synchronous))) break;
-            }
-            else
-            {
-                // Dispatch shared-privatized approach
-                const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS;
-
-                if (CubDebug(error = PrivatizedDispatch(
-                    d_temp_storage,
-                    temp_storage_bytes,
-                    d_samples,
-                    d_output_histograms,
-                    num_output_levels,
-                    privatized_decode_op,
-                    num_output_levels,
-                    output_decode_op,
-                    max_num_output_bins,
-                    num_row_pixels,
-                    num_rows,
-                    row_stride_samples,
-                    DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
-                    DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
-                    histogram_sweep_config,
-                    stream,
-                    debug_synchronous))) break;
-            }
-        }
-        while (0);
-
-        return error;
-    }
-
-
-    /**
-     * Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels)
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t DispatchEven(
-        void*               d_temp_storage,                            ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,                        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT     d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
-        CounterT*           d_output_histograms[NUM_ACTIVE_CHANNELS],  ///< [out] The pointers to the histogram counter output arrays, one for each active channel.  For channel<sub><em>i</em></sub>, the allocation length of <tt>d_histograms[i]</tt> should be <tt>num_output_levels[i]</tt> - 1.
-        int                 num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel.  Implies that the number of bins for channel<sub><em>i</em></sub> is <tt>num_output_levels[i]</tt> - 1.
-        LevelT              lower_level[NUM_ACTIVE_CHANNELS],           ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel.
-        LevelT              upper_level[NUM_ACTIVE_CHANNELS],           ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel.
-        OffsetT             num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
-        OffsetT             num_rows,                                   ///< [in] The number of rows in the region of interest
-        OffsetT             row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
-        cudaStream_t        stream,                                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-        Int2Type<true>      is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is a 8b type
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel dispatch configurations
-            KernelConfig histogram_sweep_config;
-            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
-                break;
-
-            // Use the pass-thru transform op for converting samples to privatized bins
-            typedef PassThruTransform PrivatizedDecodeOpT;
-
-            // Use the scale transform op for converting privatized bins to output bins
-            typedef ScaleTransform OutputDecodeOpT;
-
-            int                     num_privatized_levels[NUM_ACTIVE_CHANNELS];
-            PrivatizedDecodeOpT     privatized_decode_op[NUM_ACTIVE_CHANNELS];
-            OutputDecodeOpT         output_decode_op[NUM_ACTIVE_CHANNELS];
-            int                     max_levels = num_output_levels[0];
-
-            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-            {
-                num_privatized_levels[channel] = 257;
-
-                int     bins    = num_output_levels[channel] - 1;
-                LevelT  scale   = (upper_level[channel] - lower_level[channel]) / bins;
-                output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale);
-
-                if (num_output_levels[channel] > max_levels)
-                    max_levels = num_output_levels[channel];
-            }
-            int max_num_output_bins = max_levels - 1;
-
-            const int PRIVATIZED_SMEM_BINS = 256;
-
-            if (CubDebug(error = PrivatizedDispatch(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_samples,
-                d_output_histograms,
-                num_privatized_levels,
-                privatized_decode_op,
-                num_output_levels,
-                output_decode_op,
-                max_num_output_bins,
-                num_row_pixels,
-                num_rows,
-                row_stride_samples,
-                DeviceHistogramInitKernel<NUM_ACTIVE_CHANNELS, CounterT, OffsetT>,
-                DeviceHistogramSweepKernel<PtxHistogramSweepPolicy, PRIVATIZED_SMEM_BINS, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, PrivatizedDecodeOpT, OutputDecodeOpT, OffsetT>,
-                histogram_sweep_config,
-                stream,
-                debug_synchronous))) break;
-
-        }
-        while (0);
-
-        return error;
-    }
-
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh
deleted file mode 100644
index baf7f422c..000000000
--- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh
+++ /dev/null
@@ -1,1619 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "../../agent/agent_radix_sort_upsweep.cuh"
-#include "../../agent/agent_radix_sort_downsweep.cuh"
-#include "../../agent/agent_scan.cuh"
-#include "../../block/block_radix_sort.cuh"
-#include "../../grid/grid_even_share.cuh"
-#include "../../util_type.cuh"
-#include "../../util_debug.cuh"
-#include "../../util_device.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/******************************************************************************
- * Kernel entry points
- *****************************************************************************/
-
-/**
- * Upsweep digit-counting kernel entry point (multi-block).  Computes privatized digit histograms, one per block.
- */
-template <
-    typename                ChainedPolicyT,                 ///< Chained tuning policy
-    bool                    ALT_DIGIT_BITS,                 ///< Whether or not to use the alternate (lower-bits) policy
-    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
-    typename                KeyT,                           ///< Key type
-    typename                OffsetT>                        ///< Signed integer type for global offsets
-__launch_bounds__ (int((ALT_DIGIT_BITS) ?
-    ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS :
-    ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))
-__global__ void DeviceRadixSortUpsweepKernel(
-    const KeyT              *d_keys,                        ///< [in] Input keys buffer
-    OffsetT                 *d_spine,                       ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
-    OffsetT                 /*num_items*/,                  ///< [in] Total number of input data items
-    int                     current_bit,                    ///< [in] Bit position of current radix digit
-    int                     num_bits,                       ///< [in] Number of bits of current radix digit
-    GridEvenShare<OffsetT>  even_share)                     ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block
-{
-    enum {
-        TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS *
-                        ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD
-    };
-
-    // Parameterize AgentRadixSortUpsweep type for the current configuration
-    typedef AgentRadixSortUpsweep<
-            typename If<(ALT_DIGIT_BITS),
-                typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy,
-                typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>::Type,
-            KeyT,
-            OffsetT>
-        AgentRadixSortUpsweepT;
-
-    // Shared memory storage
-    __shared__ typename AgentRadixSortUpsweepT::TempStorage temp_storage;
-
-    // Initialize GRID_MAPPING_RAKE even-share descriptor for this thread block
-    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();
-
-    AgentRadixSortUpsweepT upsweep(temp_storage, d_keys, current_bit, num_bits);
-
-    upsweep.ProcessRegion(even_share.block_offset, even_share.block_end);
-
-    CTA_SYNC();
-
-    // Write out digit counts (striped)
-    upsweep.template ExtractCounts<IS_DESCENDING>(d_spine, gridDim.x, blockIdx.x);
-}
-
-
-/**
- * Spine scan kernel entry point (single-block).  Computes an exclusive prefix sum over the privatized digit histograms
- */
-template <
-    typename                ChainedPolicyT,                 ///< Chained tuning policy
-    typename                OffsetT>                        ///< Signed integer type for global offsets
-__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1)
-__global__ void RadixSortScanBinsKernel(
-    OffsetT                 *d_spine,                       ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
-    int                     num_counts)                     ///< [in] Total number of bin-counts
-{
-    // Parameterize the AgentScan type for the current configuration
-    typedef AgentScan<
-            typename ChainedPolicyT::ActivePolicy::ScanPolicy,
-            OffsetT*,
-            OffsetT*,
-            cub::Sum,
-            OffsetT,
-            OffsetT>
-        AgentScanT;
-
-    // Shared memory storage
-    __shared__ typename AgentScanT::TempStorage temp_storage;
-
-    // Block scan instance
-    AgentScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), OffsetT(0)) ;
-
-    // Process full input tiles
-    int block_offset = 0;
-    BlockScanRunningPrefixOp<OffsetT, Sum> prefix_op(0, Sum());
-    while (block_offset + AgentScanT::TILE_ITEMS <= num_counts)
-    {
-        block_scan.template ConsumeTile<false, false>(block_offset, prefix_op);
-        block_offset += AgentScanT::TILE_ITEMS;
-    }
-}
-
-
-/**
- * Downsweep pass kernel entry point (multi-block).  Scatters keys (and values) into corresponding bins for the current digit place.
- */
-template <
-    typename                ChainedPolicyT,                 ///< Chained tuning policy
-    bool                    ALT_DIGIT_BITS,                 ///< Whether or not to use the alternate (lower-bits) policy
-    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
-    typename                KeyT,                           ///< Key type
-    typename                ValueT,                         ///< Value type
-    typename                OffsetT>                        ///< Signed integer type for global offsets
-__launch_bounds__ (int((ALT_DIGIT_BITS) ?
-    ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS :
-    ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))
-__global__ void DeviceRadixSortDownsweepKernel(
-    const KeyT              *d_keys_in,                     ///< [in] Input keys buffer
-    KeyT                    *d_keys_out,                    ///< [in] Output keys buffer
-    const ValueT            *d_values_in,                   ///< [in] Input values buffer
-    ValueT                  *d_values_out,                  ///< [in] Output values buffer
-    OffsetT                 *d_spine,                       ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
-    OffsetT                 num_items,                      ///< [in] Total number of input data items
-    int                     current_bit,                    ///< [in] Bit position of current radix digit
-    int                     num_bits,                       ///< [in] Number of bits of current radix digit
-    GridEvenShare<OffsetT>  even_share)                     ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block
-{
-    enum {
-        TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS *
-                        ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD
-    };
-
-    // Parameterize AgentRadixSortDownsweep type for the current configuration
-    typedef AgentRadixSortDownsweep<
-            typename If<(ALT_DIGIT_BITS),
-                typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy,
-                typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>::Type,
-            IS_DESCENDING,
-            KeyT,
-            ValueT,
-            OffsetT>
-        AgentRadixSortDownsweepT;
-
-    // Shared memory storage
-    __shared__  typename AgentRadixSortDownsweepT::TempStorage temp_storage;
-
-    // Initialize even-share descriptor for this thread block
-    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();
-
-    // Process input tiles
-    AgentRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion(
-        even_share.block_offset,
-        even_share.block_end);
-}
-
-
-/**
- * Single pass kernel entry point (single-block).  Fully sorts a tile of input.
- */
-template <
-    typename                ChainedPolicyT,                 ///< Chained tuning policy
-    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
-    typename                KeyT,                           ///< Key type
-    typename                ValueT,                         ///< Value type
-    typename                OffsetT>                        ///< Signed integer type for global offsets
-__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
-__global__ void DeviceRadixSortSingleTileKernel(
-    const KeyT              *d_keys_in,                     ///< [in] Input keys buffer
-    KeyT                    *d_keys_out,                    ///< [in] Output keys buffer
-    const ValueT            *d_values_in,                   ///< [in] Input values buffer
-    ValueT                  *d_values_out,                  ///< [in] Output values buffer
-    OffsetT                 num_items,                      ///< [in] Total number of input data items
-    int                     current_bit,                    ///< [in] Bit position of current radix digit
-    int                     end_bit)                        ///< [in] The past-the-end (most-significant) bit index needed for key comparison
-{
-    // Constants
-    enum
-    {
-        BLOCK_THREADS           = ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD        = ChainedPolicyT::ActivePolicy::SingleTilePolicy::ITEMS_PER_THREAD,
-        KEYS_ONLY               = Equals<ValueT, NullType>::VALUE,
-    };
-
-    // BlockRadixSort type
-    typedef BlockRadixSort<
-            KeyT,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD,
-            ValueT,
-            ChainedPolicyT::ActivePolicy::SingleTilePolicy::RADIX_BITS,
-            (ChainedPolicyT::ActivePolicy::SingleTilePolicy::RANK_ALGORITHM == RADIX_RANK_MEMOIZE),
-            ChainedPolicyT::ActivePolicy::SingleTilePolicy::SCAN_ALGORITHM>
-        BlockRadixSortT;
-
-    // BlockLoad type (keys)
-    typedef BlockLoad<
-        KeyT,
-        BLOCK_THREADS,
-        ITEMS_PER_THREAD,
-        ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadKeys;
-
-    // BlockLoad type (values)
-    typedef BlockLoad<
-        ValueT,
-        BLOCK_THREADS,
-        ITEMS_PER_THREAD,
-        ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadValues;
-
-    // Unsigned word for key bits
-    typedef typename Traits<KeyT>::UnsignedBits UnsignedBitsT;
-
-    // Shared memory storage
-    __shared__ union TempStorage
-    {
-        typename BlockRadixSortT::TempStorage       sort;
-        typename BlockLoadKeys::TempStorage         load_keys;
-        typename BlockLoadValues::TempStorage       load_values;
-
-    } temp_storage;
-
-    // Keys and values for the block
-    KeyT            keys[ITEMS_PER_THREAD];
-    ValueT          values[ITEMS_PER_THREAD];
-
-    // Get default (min/max) value for out-of-bounds keys
-    UnsignedBitsT   default_key_bits = (IS_DESCENDING) ? Traits<KeyT>::LOWEST_KEY : Traits<KeyT>::MAX_KEY;
-    KeyT            default_key = reinterpret_cast<KeyT&>(default_key_bits);
-
-    // Load keys
-    BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items, default_key);
-
-    CTA_SYNC();
-
-    // Load values
-    if (!KEYS_ONLY)
-    {
-        // Register pressure work-around: moving num_items through shfl prevents compiler
-        // from reusing guards/addressing from prior guarded loads
-        num_items = ShuffleIndex(num_items, 0, CUB_PTX_WARP_THREADS, 0xffffffff);
-
-        BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items);
-
-        CTA_SYNC();
-    }
-
-    // Sort tile
-    BlockRadixSortT(temp_storage.sort).SortBlockedToStriped(
-        keys,
-        values,
-        current_bit,
-        end_bit,
-        Int2Type<IS_DESCENDING>(),
-        Int2Type<KEYS_ONLY>());
-
-    // Store keys and values
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-    {
-        int item_offset = ITEM * BLOCK_THREADS + threadIdx.x;
-        if (item_offset < num_items)
-        {
-            d_keys_out[item_offset] = keys[ITEM];
-            if (!KEYS_ONLY)
-                d_values_out[item_offset] = values[ITEM];
-        }
-    }
-}
-
-
-/**
- * Segmented radix sorting pass (one block per segment)
- */
-template <
-    typename                ChainedPolicyT,                 ///< Chained tuning policy
-    bool                    ALT_DIGIT_BITS,                 ///< Whether or not to use the alternate (lower-bits) policy
-    bool                    IS_DESCENDING,                  ///< Whether or not the sorted-order is high-to-low
-    typename                KeyT,                           ///< Key type
-    typename                ValueT,                         ///< Value type
-    typename                OffsetIteratorT,                ///< Random-access input iterator type for reading segment offsets \iterator
-    typename                OffsetT>                        ///< Signed integer type for global offsets
-__launch_bounds__ (int((ALT_DIGIT_BITS) ?
-    ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS :
-    ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS))
-__global__ void DeviceSegmentedRadixSortKernel(
-    const KeyT              *d_keys_in,                     ///< [in] Input keys buffer
-    KeyT                    *d_keys_out,                    ///< [in] Output keys buffer
-    const ValueT            *d_values_in,                   ///< [in] Input values buffer
-    ValueT                  *d_values_out,                  ///< [in] Output values buffer
-    OffsetIteratorT         d_begin_offsets,                ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-    OffsetIteratorT         d_end_offsets,                  ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-    int                     /*num_segments*/,               ///< [in] The number of segments that comprise the sorting data
-    int                     current_bit,                    ///< [in] Bit position of current radix digit
-    int                     pass_bits)                      ///< [in] Number of bits of current radix digit
-{
-    //
-    // Constants
-    //
-
-    typedef typename If<(ALT_DIGIT_BITS),
-        typename ChainedPolicyT::ActivePolicy::AltSegmentedPolicy,
-        typename ChainedPolicyT::ActivePolicy::SegmentedPolicy>::Type SegmentedPolicyT;
-
-    enum
-    {
-        BLOCK_THREADS       = SegmentedPolicyT::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = SegmentedPolicyT::ITEMS_PER_THREAD,
-        RADIX_BITS          = SegmentedPolicyT::RADIX_BITS,
-        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
-        RADIX_DIGITS        = 1 << RADIX_BITS,
-        KEYS_ONLY           = Equals<ValueT, NullType>::VALUE,
-    };
-
-    // Upsweep type
-    typedef AgentRadixSortUpsweep<
-            AgentRadixSortUpsweepPolicy<BLOCK_THREADS, ITEMS_PER_THREAD, SegmentedPolicyT::LOAD_MODIFIER, RADIX_BITS>,
-            KeyT,
-            OffsetT>
-        BlockUpsweepT;
-
-    // Digit-scan type
-    typedef BlockScan<OffsetT, BLOCK_THREADS> DigitScanT;
-
-    // Downsweep type
-    typedef AgentRadixSortDownsweep<SegmentedPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT> BlockDownsweepT;
-
-    enum
-    {
-        /// Number of bin-starting offsets tracked per thread
-        BINS_TRACKED_PER_THREAD = BlockDownsweepT::BINS_TRACKED_PER_THREAD
-    };
-
-    //
-    // Process input tiles
-    //
-
-    // Shared memory storage
-    __shared__ union
-    {
-        typename BlockUpsweepT::TempStorage     upsweep;
-        typename BlockDownsweepT::TempStorage   downsweep;
-        struct
-        {
-            volatile OffsetT                        reverse_counts_in[RADIX_DIGITS];
-            volatile OffsetT                        reverse_counts_out[RADIX_DIGITS];
-            typename DigitScanT::TempStorage        scan;
-        };
-
-    } temp_storage;
-
-    OffsetT segment_begin   = d_begin_offsets[blockIdx.x];
-    OffsetT segment_end     = d_end_offsets[blockIdx.x];
-    OffsetT num_items       = segment_end - segment_begin;
-
-    // Check if empty segment
-    if (num_items <= 0)
-        return;
-
-    // Upsweep
-    BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits);
-    upsweep.ProcessRegion(segment_begin, segment_end);
-
-    CTA_SYNC();
-
-    // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads)
-    OffsetT bin_count[BINS_TRACKED_PER_THREAD];
-    upsweep.ExtractCounts(bin_count);
-
-    CTA_SYNC();
-
-    if (IS_DESCENDING)
-    {
-        // Reverse bin counts
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
-
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-                temp_storage.reverse_counts_in[bin_idx] = bin_count[track];
-        }
-
-        CTA_SYNC();
-
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
-
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-                bin_count[track] = temp_storage.reverse_counts_in[RADIX_DIGITS - bin_idx - 1];
-        }
-    }
-
-    // Scan
-    OffsetT bin_offset[BINS_TRACKED_PER_THREAD];     // The global scatter base offset for each digit value in this pass (valid in the first RADIX_DIGITS threads)
-    DigitScanT(temp_storage.scan).ExclusiveSum(bin_count, bin_offset);
-
-    #pragma unroll
-    for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-    {
-        bin_offset[track] += segment_begin;
-    }
-
-    if (IS_DESCENDING)
-    {
-        // Reverse bin offsets
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
-
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-                temp_storage.reverse_counts_out[threadIdx.x] = bin_offset[track];
-        }
-
-        CTA_SYNC();
-
-        #pragma unroll
-        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
-        {
-            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
-
-            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
-                bin_offset[track] = temp_storage.reverse_counts_out[RADIX_DIGITS - bin_idx - 1];
-        }
-    }
-
-    CTA_SYNC();
-
-    // Downsweep
-    BlockDownsweepT downsweep(temp_storage.downsweep, bin_offset, num_items, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, pass_bits);
-    downsweep.ProcessRegion(segment_begin, segment_end);
-}
-
-
-
-/******************************************************************************
- * Policy
- ******************************************************************************/
-
-/**
- * Tuning policy for kernel specialization
- */
-template <
-    typename KeyT,          ///< Key type
-    typename ValueT,        ///< Value type
-    typename OffsetT>       ///< Signed integer type for global offsets
-struct DeviceRadixSortPolicy
-{
-    //------------------------------------------------------------------------------
-    // Constants
-    //------------------------------------------------------------------------------
-
-    enum
-    {
-        // Whether this is a keys-only (or key-value) sort
-        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
-    };
-
-    // Dominant-sized key/value type
-    typedef typename If<(sizeof(ValueT) > 4) && (sizeof(KeyT) < sizeof(ValueT)), ValueT, KeyT>::Type DominantT;
-
-    //------------------------------------------------------------------------------
-    // Architecture-specific tuning policies
-    //------------------------------------------------------------------------------
-
-    /// SM20
-    struct Policy200 : ChainedPolicy<200, Policy200, Policy200>
-    {
-        enum {
-            PRIMARY_RADIX_BITS      = 5,
-            ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,
-
-            // Relative size of KeyT type to a 4-byte word
-            SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4,
-        };
-
-        // Keys-only upsweep policies
-        typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>    UpsweepPolicyKeys;
-        typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>        AltUpsweepPolicyKeys;
-
-        // Key-value pairs upsweep policies
-        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>   UpsweepPolicyPairs;
-        typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>       AltUpsweepPolicyPairs;
-
-        // Upsweep policies
-        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type         UpsweepPolicy;
-        typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type   AltUpsweepPolicy;
-
-        // Scan policy
-        typedef AgentScanPolicy <512, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
-
-        // Keys-only downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>    DownsweepPolicyKeys;
-        typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>        AltDownsweepPolicyKeys;
-
-        // Key-value pairs downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicyPairs;
-        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>       AltDownsweepPolicyPairs;
-
-        // Downsweep policies
-        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type         DownsweepPolicy;
-        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type   AltDownsweepPolicy;
-
-        // Single-tile policy
-        typedef DownsweepPolicy SingleTilePolicy;
-
-        // Segmented policies
-        typedef DownsweepPolicy     SegmentedPolicy;
-        typedef AltDownsweepPolicy  AltSegmentedPolicy;
-    };
-
-    /// SM30
-    struct Policy300 : ChainedPolicy<300, Policy300, Policy200>
-    {
-        enum {
-            PRIMARY_RADIX_BITS      = 5,
-            ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,
-
-            // Relative size of KeyT type to a 4-byte word
-            SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4,
-        };
-
-        // Keys-only upsweep policies
-        typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>    UpsweepPolicyKeys;
-        typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>        AltUpsweepPolicyKeys;
-
-        // Key-value pairs upsweep policies
-        typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS>    UpsweepPolicyPairs;
-        typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS>        AltUpsweepPolicyPairs;
-
-        // Upsweep policies
-        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type         UpsweepPolicy;
-        typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type   AltUpsweepPolicy;
-
-        // Scan policy
-        typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
-
-        // Keys-only downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicyKeys;
-        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>       AltDownsweepPolicyKeys;
-
-        // Key-value pairs downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>    DownsweepPolicyPairs;
-        typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS>        AltDownsweepPolicyPairs;
-
-        // Downsweep policies
-        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type         DownsweepPolicy;
-        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type   AltDownsweepPolicy;
-
-        // Single-tile policy
-        typedef DownsweepPolicy SingleTilePolicy;
-
-        // Segmented policies
-        typedef DownsweepPolicy     SegmentedPolicy;
-        typedef AltDownsweepPolicy  AltSegmentedPolicy;
-    };
-
-
-    /// SM35
-    struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
-    {
-        enum {
-            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 6 : 5,    // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m)
-        };
-
-        // Scan policy
-        typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
-
-        // Keys-only downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(128, 9, DominantT), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_LDG, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys;
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(64, 18, DominantT), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyKeys;
-
-        // Key-value pairs downsweep policies
-        typedef DownsweepPolicyKeys DownsweepPolicyPairs;
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(128, 15, DominantT), BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1> AltDownsweepPolicyPairs;
-
-        // Downsweep policies
-        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
-        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;
-
-        // Upsweep policies
-        typedef DownsweepPolicy UpsweepPolicy;
-        typedef AltDownsweepPolicy AltUpsweepPolicy;
-
-        // Single-tile policy
-        typedef DownsweepPolicy SingleTilePolicy;
-
-        // Segmented policies
-        typedef DownsweepPolicy     SegmentedPolicy;
-        typedef AltDownsweepPolicy  AltSegmentedPolicy;
-
-
-    };
-
-
-    /// SM50
-    struct Policy500 : ChainedPolicy<500, Policy500, Policy350>
-    {
-        enum {
-            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX)
-            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,
-            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 3.1B 32b segmented keys/s (TitanX)
-        };
-
-        // ScanPolicy
-        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
-
-        // Downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(160, 39, DominantT),  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>  DownsweepPolicy;
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 16, DominantT),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
-
-        // Upsweep policies
-        typedef DownsweepPolicy UpsweepPolicy;
-        typedef AltDownsweepPolicy AltUpsweepPolicy;
-
-        // Single-tile policy
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 19, DominantT),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS> SingleTilePolicy;
-
-        // Segmented policies
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(192, 31, DominantT),  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>   SegmentedPolicy;
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 11, DominantT),  BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1>       AltSegmentedPolicy;
-    };
-
-
-    /// SM60 (GP100)
-    struct Policy600 : ChainedPolicy<600, Policy600, Policy500>
-    {
-        enum {
-            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 6.9B 32b keys/s (Quadro P100)
-            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,
-            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 5.9B 32b segmented keys/s (Quadro P100)
-        };
-
-        // ScanPolicy
-        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
-
-        // Downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 25, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicy;
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(192, 39, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
-
-        // Upsweep policies
-        typedef DownsweepPolicy UpsweepPolicy;
-        typedef AltDownsweepPolicy AltUpsweepPolicy;
-
-        // Single-tile policy
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 19, DominantT),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;
-
-        // Segmented policies
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(192, 39, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(384, 11, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
-
-    };
-
-
-    /// SM61 (GP104)
-    struct Policy610 : ChainedPolicy<610, Policy610, Policy600>
-    {
-        enum {
-            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080)
-            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,
-            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 3.3B 32b segmented keys/s (1080)
-        };
-
-        // ScanPolicy
-        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
-
-        // Downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(384, 31, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS>   DownsweepPolicy;
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 35, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
-
-        // Upsweep policies
-        typedef AgentRadixSortUpsweepPolicy <CUB_SCALED_GRANULARITIES(128, 16, DominantT), LOAD_LDG, PRIMARY_RADIX_BITS>        UpsweepPolicy;
-        typedef AgentRadixSortUpsweepPolicy <CUB_SCALED_GRANULARITIES(128, 16, DominantT), LOAD_LDG, PRIMARY_RADIX_BITS - 1>    AltUpsweepPolicy;
-
-        // Single-tile policy
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 19, DominantT),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;
-
-        // Segmented policies
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(192, 39, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(384, 11, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
-    };
-
-
-    /// SM62 (Tegra, less RF)
-    struct Policy620 : ChainedPolicy<620, Policy620, Policy610>
-    {
-        enum {
-            PRIMARY_RADIX_BITS      = 5,
-            ALT_RADIX_BITS          = PRIMARY_RADIX_BITS - 1,
-        };
-
-        // ScanPolicy
-        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
-
-        // Downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 16, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, PRIMARY_RADIX_BITS>   DownsweepPolicy;
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 16, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_RAKING_MEMOIZE, ALT_RADIX_BITS>       AltDownsweepPolicy;
-
-        // Upsweep policies
-        typedef DownsweepPolicy UpsweepPolicy;
-        typedef AltDownsweepPolicy AltUpsweepPolicy;
-
-        // Single-tile policy
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 19, DominantT),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> SingleTilePolicy;
-
-        // Segmented policies
-        typedef DownsweepPolicy     SegmentedPolicy;
-        typedef AltDownsweepPolicy  AltSegmentedPolicy;
-    };
-
-
-    /// SM70 (GV100)
-    struct Policy700 : ChainedPolicy<700, Policy700, Policy620>
-    {
-        enum {
-            PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 7.62B 32b keys/s (GV100)
-            SINGLE_TILE_RADIX_BITS  = (sizeof(KeyT) > 1) ? 6 : 5,
-            SEGMENTED_RADIX_BITS    = (sizeof(KeyT) > 1) ? 6 : 5,    // 8.7B 32b segmented keys/s (GV100)
-        };
-
-        // ScanPolicy
-        typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
-
-        // Downsweep policies
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 25, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MATCH, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS>   DownsweepPolicy;
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 25, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS - 1>   AltDownsweepPolicy;
-
-        // Upsweep policies
-        typedef DownsweepPolicy UpsweepPolicy;
-        typedef AltDownsweepPolicy AltUpsweepPolicy;
-
-        // Single-tile policy
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(256, 19, DominantT),  BLOCK_LOAD_DIRECT, LOAD_LDG, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SINGLE_TILE_RADIX_BITS>          SingleTilePolicy;
-
-        // Segmented policies
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(192, 39, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS>     SegmentedPolicy;
-        typedef AgentRadixSortDownsweepPolicy <CUB_SCALED_GRANULARITIES(384, 11, DominantT),  BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_MEMOIZE, BLOCK_SCAN_WARP_SCANS, SEGMENTED_RADIX_BITS - 1> AltSegmentedPolicy;
-    };
-
-
-    /// MaxPolicy
-    typedef Policy700 MaxPolicy;
-
-
-};
-
-
-
-/******************************************************************************
- * Single-problem dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for device-wide radix sort
- */
-template <
-    bool     IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low
-    typename KeyT,          ///< Key type
-    typename ValueT,        ///< Value type
-    typename OffsetT>       ///< Signed integer type for global offsets
-struct DispatchRadixSort :
-    DeviceRadixSortPolicy<KeyT, ValueT, OffsetT>
-{
-    //------------------------------------------------------------------------------
-    // Constants
-    //------------------------------------------------------------------------------
-
-    enum
-    {
-        // Whether this is a keys-only (or key-value) sort
-        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
-    };
-
-
-    //------------------------------------------------------------------------------
-    // Problem state
-    //------------------------------------------------------------------------------
-
-    void                    *d_temp_storage;        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-    size_t                  &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-    DoubleBuffer<KeyT>      &d_keys;                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-    DoubleBuffer<ValueT>    &d_values;              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-    OffsetT                 num_items;              ///< [in] Number of items to sort
-    int                     begin_bit;              ///< [in] The beginning (least-significant) bit index needed for key comparison
-    int                     end_bit;                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
-    cudaStream_t            stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-    bool                    debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    int                     ptx_version;            ///< [in] PTX version
-    bool                    is_overwrite_okay;      ///< [in] Whether is okay to overwrite source buffers
-
-
-    //------------------------------------------------------------------------------
-    // Constructor
-    //------------------------------------------------------------------------------
-
-    /// Constructor
-    CUB_RUNTIME_FUNCTION __forceinline__
-    DispatchRadixSort(
-        void*                   d_temp_storage,
-        size_t                  &temp_storage_bytes,
-        DoubleBuffer<KeyT>      &d_keys,
-        DoubleBuffer<ValueT>    &d_values,
-        OffsetT                 num_items,
-        int                     begin_bit,
-        int                     end_bit,
-        bool                    is_overwrite_okay,
-        cudaStream_t            stream,
-        bool                    debug_synchronous,
-        int                     ptx_version)
-    :
-        d_temp_storage(d_temp_storage),
-        temp_storage_bytes(temp_storage_bytes),
-        d_keys(d_keys),
-        d_values(d_values),
-        num_items(num_items),
-        begin_bit(begin_bit),
-        end_bit(end_bit),
-        stream(stream),
-        debug_synchronous(debug_synchronous),
-        ptx_version(ptx_version),
-        is_overwrite_okay(is_overwrite_okay)
-    {}
-
-
-    //------------------------------------------------------------------------------
-    // Small-problem (single tile) invocation
-    //------------------------------------------------------------------------------
-
-    /// Invoke a single block to sort in-core
-    template <
-        typename                ActivePolicyT,          ///< Umbrella policy active for the target device
-        typename                SingleTileKernelT>      ///< Function type of cub::DeviceRadixSortSingleTileKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t InvokeSingleTile(
-        SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel
-    {
-#ifndef CUB_RUNTIME_ENABLED
-        (void)single_tile_kernel;
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-#else
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Return if the caller is simply requesting the size of the storage allocation
-            if (d_temp_storage == NULL)
-            {
-                temp_storage_bytes = 1;
-                break;
-            }
-
-            // Return if empty problem
-            if (num_items == 0)
-                break;
-
-            // Log single_tile_kernel configuration
-            if (debug_synchronous)
-                _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
-                    1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream,
-                    ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS);
-
-            // Invoke upsweep_kernel with same grid size as downsweep_kernel
-            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
-                d_keys.Current(),
-                d_keys.Alternate(),
-                d_values.Current(),
-                d_values.Alternate(),
-                num_items,
-                begin_bit,
-                end_bit);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Update selector
-            d_keys.selector ^= 1;
-            d_values.selector ^= 1;
-        }
-        while (0);
-
-        return error;
-
-#endif // CUB_RUNTIME_ENABLED
-    }
-
-
-    //------------------------------------------------------------------------------
-    // Normal problem size invocation
-    //------------------------------------------------------------------------------
-
-    /**
-     * Invoke a three-kernel sorting pass at the current bit.
-     */
-    template <typename PassConfigT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t InvokePass(
-        const KeyT      *d_keys_in,
-        KeyT            *d_keys_out,
-        const ValueT    *d_values_in,
-        ValueT          *d_values_out,
-        OffsetT         *d_spine,
-        int             spine_length,
-        int             &current_bit,
-        PassConfigT     &pass_config)
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
-
-            // Log upsweep_kernel configuration
-            if (debug_synchronous)
-                _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
-                pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream,
-                pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);
-
-            // Invoke upsweep_kernel with same grid size as downsweep_kernel
-            pass_config.upsweep_kernel<<<pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream>>>(
-                d_keys_in,
-                d_spine,
-                num_items,
-                current_bit,
-                pass_bits,
-                pass_config.even_share);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Log scan_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
-                1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread);
-
-            // Invoke scan_kernel
-            pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>(
-                d_spine,
-                spine_length);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Log downsweep_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream,
-                pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy);
-
-            // Invoke downsweep_kernel
-            pass_config.downsweep_kernel<<<pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream>>>(
-                d_keys_in,
-                d_keys_out,
-                d_values_in,
-                d_values_out,
-                d_spine,
-                num_items,
-                current_bit,
-                pass_bits,
-                pass_config.even_share);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Update current bit
-            current_bit += pass_bits;
-        }
-        while (0);
-
-        return error;
-    }
-
-
-
-    /// Pass configuration structure
-    template <
-        typename UpsweepKernelT,
-        typename ScanKernelT,
-        typename DownsweepKernelT>
-    struct PassConfig
-    {
-        UpsweepKernelT          upsweep_kernel;
-        KernelConfig            upsweep_config;
-        ScanKernelT             scan_kernel;
-        KernelConfig            scan_config;
-        DownsweepKernelT        downsweep_kernel;
-        KernelConfig            downsweep_config;
-        int                     radix_bits;
-        int                     radix_digits;
-        int                     max_downsweep_grid_size;
-        GridEvenShare<OffsetT>  even_share;
-
-        /// Initialize pass configuration
-        template <
-            typename UpsweepPolicyT,
-            typename ScanPolicyT,
-            typename DownsweepPolicyT>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        cudaError_t InitPassConfig(
-            UpsweepKernelT      upsweep_kernel,
-            ScanKernelT         scan_kernel,
-            DownsweepKernelT    downsweep_kernel,
-            int                 ptx_version,
-            int                 sm_count,
-            int                 num_items)
-        {
-            cudaError error = cudaSuccess;
-            do
-            {
-                this->upsweep_kernel    = upsweep_kernel;
-                this->scan_kernel       = scan_kernel;
-                this->downsweep_kernel  = downsweep_kernel;
-                radix_bits              = DownsweepPolicyT::RADIX_BITS;
-                radix_digits            = 1 << radix_bits;
-
-                if (CubDebug(error = upsweep_config.Init<UpsweepPolicyT>(upsweep_kernel))) break;
-                if (CubDebug(error = scan_config.Init<ScanPolicyT>(scan_kernel))) break;
-                if (CubDebug(error = downsweep_config.Init<DownsweepPolicyT>(downsweep_kernel))) break;
-
-                max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(ptx_version);
-
-                even_share.DispatchInit(
-                    num_items,
-                    max_downsweep_grid_size,
-                    CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size));
-
-            }
-            while (0);
-            return error;
-        }
-
-    };
-
-
-    /// Invocation (run multiple digit passes)
-    template <
-        typename            ActivePolicyT,          ///< Umbrella policy active for the target device
-        typename            UpsweepKernelT,         ///< Function type of cub::DeviceRadixSortUpsweepKernel
-        typename            ScanKernelT,            ///< Function type of cub::SpineScanKernel
-        typename            DownsweepKernelT>       ///< Function type of cub::DeviceRadixSortDownsweepKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t InvokePasses(
-        UpsweepKernelT      upsweep_kernel,         ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
-        UpsweepKernelT      alt_upsweep_kernel,     ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
-        ScanKernelT         scan_kernel,            ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel
-        DownsweepKernelT    downsweep_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel
-        DownsweepKernelT    alt_downsweep_kernel)   ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel
-    {
-#ifndef CUB_RUNTIME_ENABLED
-        (void)upsweep_kernel;
-        (void)alt_upsweep_kernel;
-        (void)scan_kernel;
-        (void)downsweep_kernel;
-        (void)alt_downsweep_kernel;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-#else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Init regular and alternate-digit kernel configurations
-            PassConfig<UpsweepKernelT, ScanKernelT, DownsweepKernelT> pass_config, alt_pass_config;
-            if ((error = pass_config.template InitPassConfig<
-                    typename ActivePolicyT::UpsweepPolicy, 
-                    typename ActivePolicyT::ScanPolicy, 
-                    typename ActivePolicyT::DownsweepPolicy>(
-                upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items))) break;
-
-            if ((error = alt_pass_config.template InitPassConfig<
-                    typename ActivePolicyT::AltUpsweepPolicy, 
-                    typename ActivePolicyT::ScanPolicy, 
-                    typename ActivePolicyT::AltDownsweepPolicy>(
-                alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items))) break;
-
-            // Get maximum spine length
-            int max_grid_size       = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size);
-            int spine_length        = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size;
-
-            // Temporary storage allocation requirements
-            void* allocations[3];
-            size_t allocation_sizes[3] =
-            {
-                spine_length * sizeof(OffsetT),                                         // bytes needed for privatized block digit histograms
-                (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT),                     // bytes needed for 3rd keys buffer
-                (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT),    // bytes needed for 3rd values buffer
-            };
-
-            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-
-            // Return if the caller is simply requesting the size of the storage allocation
-            if (d_temp_storage == NULL)
-                return cudaSuccess;
-
-            // Pass planning.  Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
-            int num_bits            = end_bit - begin_bit;
-            int num_passes          = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits;
-            bool is_num_passes_odd  = num_passes & 1;
-            int max_alt_passes      = (num_passes * pass_config.radix_bits) - num_bits;
-            int alt_end_bit         = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));
-
-            // Alias the temporary storage allocations
-            OffsetT *d_spine = static_cast<OffsetT*>(allocations[0]);
-
-            DoubleBuffer<KeyT> d_keys_remaining_passes(
-                (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[1]),
-                (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[1]) : d_keys.Alternate());
-
-            DoubleBuffer<ValueT> d_values_remaining_passes(
-                (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[2]),
-                (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[2]) : d_values.Alternate());
-
-            // Run first pass, consuming from the input's current buffers
-            int current_bit = begin_bit;
-            if (CubDebug(error = InvokePass(
-                d_keys.Current(), d_keys_remaining_passes.Current(),
-                d_values.Current(), d_values_remaining_passes.Current(),
-                d_spine, spine_length, current_bit,
-                (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
-
-            // Run remaining passes
-            while (current_bit < end_bit)
-            {
-                if (CubDebug(error = InvokePass(
-                    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector],    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
-                    d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector],  d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
-                    d_spine, spine_length, current_bit,
-                    (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;;
-
-                // Invert selectors
-                d_keys_remaining_passes.selector ^= 1;
-                d_values_remaining_passes.selector ^= 1;
-            }
-
-            // Update selector
-            if (!is_overwrite_okay) {
-                num_passes = 1; // Sorted data always ends up in the other vector
-            }
-
-            d_keys.selector = (d_keys.selector + num_passes) & 1;
-            d_values.selector = (d_values.selector + num_passes) & 1;
-        }
-        while (0);
-
-        return error;
-
-#endif // CUB_RUNTIME_ENABLED
-    }
-
-
-    //------------------------------------------------------------------------------
-    // Chained policy invocation
-    //------------------------------------------------------------------------------
-
-    /// Invocation
-    template <typename ActivePolicyT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t Invoke()
-    {
-        typedef typename DispatchRadixSort::MaxPolicy       MaxPolicyT;
-        typedef typename ActivePolicyT::SingleTilePolicy    SingleTilePolicyT;
-
-        // Force kernel code-generation in all compiler passes
-        if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))
-        {
-            // Small, single tile size
-            return InvokeSingleTile<ActivePolicyT>(
-                DeviceRadixSortSingleTileKernel<MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT>);
-        }
-        else
-        {
-            // Regular size
-            return InvokePasses<ActivePolicyT>(
-                DeviceRadixSortUpsweepKernel<   MaxPolicyT, false,   IS_DESCENDING, KeyT, OffsetT>,
-                DeviceRadixSortUpsweepKernel<   MaxPolicyT, true,    IS_DESCENDING, KeyT, OffsetT>,
-                RadixSortScanBinsKernel<        MaxPolicyT, OffsetT>,
-                DeviceRadixSortDownsweepKernel< MaxPolicyT, false,   IS_DESCENDING, KeyT, ValueT, OffsetT>,
-                DeviceRadixSortDownsweepKernel< MaxPolicyT, true,    IS_DESCENDING, KeyT, ValueT, OffsetT>);
-        }
-    }
-
-
-    //------------------------------------------------------------------------------
-    // Dispatch entrypoints
-    //------------------------------------------------------------------------------
-
-    /**
-     * Internal dispatch routine
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*                   d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>      &d_keys,                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        DoubleBuffer<ValueT>    &d_values,              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-        OffsetT                 num_items,              ///< [in] Number of items to sort
-        int                     begin_bit,              ///< [in] The beginning (least-significant) bit index needed for key comparison
-        int                     end_bit,                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
-        bool                    is_overwrite_okay,      ///< [in] Whether is okay to overwrite source buffers
-        cudaStream_t            stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT;
-
-        cudaError_t error;
-        do {
-            // Get PTX version
-            int ptx_version;
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-
-            // Create dispatch functor
-            DispatchRadixSort dispatch(
-                d_temp_storage, temp_storage_bytes,
-                d_keys, d_values,
-                num_items, begin_bit, end_bit, is_overwrite_okay,
-                stream, debug_synchronous, ptx_version);
-
-            // Dispatch to chained policy
-            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
-
-        } while (0);
-
-        return error;
-    }
-};
-
-
-
-
-/******************************************************************************
- * Segmented dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for segmented device-wide radix sort
- */
-template <
-    bool     IS_DESCENDING,     ///< Whether or not the sorted-order is high-to-low
-    typename KeyT,              ///< Key type
-    typename ValueT,            ///< Value type
-    typename OffsetIteratorT,   ///< Random-access input iterator type for reading segment offsets \iterator
-    typename OffsetT>           ///< Signed integer type for global offsets
-struct DispatchSegmentedRadixSort :
-    DeviceRadixSortPolicy<KeyT, ValueT, OffsetT>
-{
-    //------------------------------------------------------------------------------
-    // Constants
-    //------------------------------------------------------------------------------
-
-    enum
-    {
-        // Whether this is a keys-only (or key-value) sort
-        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
-    };
-
-
-    //------------------------------------------------------------------------------
-    // Parameter members
-    //------------------------------------------------------------------------------
-
-    void                    *d_temp_storage;        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-    size_t                  &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-    DoubleBuffer<KeyT>      &d_keys;                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-    DoubleBuffer<ValueT>    &d_values;              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-    OffsetT                 num_items;              ///< [in] Number of items to sort
-    OffsetT                 num_segments;           ///< [in] The number of segments that comprise the sorting data
-    OffsetIteratorT         d_begin_offsets;        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-    OffsetIteratorT         d_end_offsets;          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-    int                     begin_bit;              ///< [in] The beginning (least-significant) bit index needed for key comparison
-    int                     end_bit;                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
-    cudaStream_t            stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-    bool                    debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    int                     ptx_version;            ///< [in] PTX version
-    bool                    is_overwrite_okay;      ///< [in] Whether is okay to overwrite source buffers
-
-
-    //------------------------------------------------------------------------------
-    // Constructors
-    //------------------------------------------------------------------------------
-
-    /// Constructor
-    CUB_RUNTIME_FUNCTION __forceinline__
-    DispatchSegmentedRadixSort(
-        void*                   d_temp_storage,
-        size_t                  &temp_storage_bytes,
-        DoubleBuffer<KeyT>      &d_keys,
-        DoubleBuffer<ValueT>    &d_values,
-        OffsetT                 num_items,
-        OffsetT                 num_segments,
-        OffsetIteratorT         d_begin_offsets,
-        OffsetIteratorT         d_end_offsets,
-        int                     begin_bit,
-        int                     end_bit,
-        bool                    is_overwrite_okay,
-        cudaStream_t            stream,
-        bool                    debug_synchronous,
-        int                     ptx_version)
-    :
-        d_temp_storage(d_temp_storage),
-        temp_storage_bytes(temp_storage_bytes),
-        d_keys(d_keys),
-        d_values(d_values),
-        num_items(num_items),
-        num_segments(num_segments),
-        d_begin_offsets(d_begin_offsets),
-        d_end_offsets(d_end_offsets),
-        begin_bit(begin_bit),
-        end_bit(end_bit),
-        is_overwrite_okay(is_overwrite_okay),
-        stream(stream),
-        debug_synchronous(debug_synchronous),
-        ptx_version(ptx_version)
-    {}
-
-
-    //------------------------------------------------------------------------------
-    // Multi-segment invocation
-    //------------------------------------------------------------------------------
-
-    /// Invoke a three-kernel sorting pass at the current bit.
-    template <typename PassConfigT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t InvokePass(
-        const KeyT      *d_keys_in,
-        KeyT            *d_keys_out,
-        const ValueT    *d_values_in,
-        ValueT          *d_values_out,
-        int             &current_bit,
-        PassConfigT     &pass_config)
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
-
-            // Log kernel configuration
-            if (debug_synchronous)
-                _CubLog("Invoking segmented_kernels<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
-                    num_segments, pass_config.segmented_config.block_threads, (long long) stream,
-                pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits);
-
-            pass_config.segmented_kernel<<<num_segments, pass_config.segmented_config.block_threads, 0, stream>>>(
-                d_keys_in, d_keys_out,
-                d_values_in,  d_values_out,
-                d_begin_offsets, d_end_offsets, num_segments,
-                current_bit, pass_bits);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Update current bit
-            current_bit += pass_bits;
-        }
-        while (0);
-
-        return error;
-    }
-
-
-    /// PassConfig data structure
-    template <typename SegmentedKernelT>
-    struct PassConfig
-    {
-        SegmentedKernelT    segmented_kernel;
-        KernelConfig        segmented_config;
-        int                 radix_bits;
-        int                 radix_digits;
-
-        /// Initialize pass configuration
-        template <typename SegmentedPolicyT>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel)
-        {
-            this->segmented_kernel  = segmented_kernel;
-            this->radix_bits        = SegmentedPolicyT::RADIX_BITS;
-            this->radix_digits      = 1 << radix_bits;
-
-            return CubDebug(segmented_config.Init<SegmentedPolicyT>(segmented_kernel));
-        }
-    };
-
-
-    /// Invocation (run multiple digit passes)
-    template <
-        typename                ActivePolicyT,          ///< Umbrella policy active for the target device
-        typename                SegmentedKernelT>       ///< Function type of cub::DeviceSegmentedRadixSortKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t InvokePasses(
-        SegmentedKernelT     segmented_kernel,          ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel
-        SegmentedKernelT     alt_segmented_kernel)      ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel
-    {
-#ifndef CUB_RUNTIME_ENABLED
-      (void)segmented_kernel;
-      (void)alt_segmented_kernel;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-#else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Init regular and alternate kernel configurations
-            PassConfig<SegmentedKernelT> pass_config, alt_pass_config;
-            if ((error = pass_config.template       InitPassConfig<typename ActivePolicyT::SegmentedPolicy>(segmented_kernel))) break;
-            if ((error = alt_pass_config.template   InitPassConfig<typename ActivePolicyT::AltSegmentedPolicy>(alt_segmented_kernel))) break;
-
-            // Temporary storage allocation requirements
-            void* allocations[2];
-            size_t allocation_sizes[2] =
-            {
-                (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT),                      // bytes needed for 3rd keys buffer
-                (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT),     // bytes needed for 3rd values buffer
-            };
-
-            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-
-            // Return if the caller is simply requesting the size of the storage allocation
-            if (d_temp_storage == NULL)
-            {
-                if (temp_storage_bytes == 0)
-                    temp_storage_bytes = 1;
-                return cudaSuccess;
-            }
-
-            // Pass planning.  Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
-            int radix_bits          = ActivePolicyT::SegmentedPolicy::RADIX_BITS;
-            int alt_radix_bits      = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS;
-            int num_bits            = end_bit - begin_bit;
-            int num_passes          = (num_bits + radix_bits - 1) / radix_bits;
-            bool is_num_passes_odd  = num_passes & 1;
-            int max_alt_passes      = (num_passes * radix_bits) - num_bits;
-            int alt_end_bit         = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits));
-
-            DoubleBuffer<KeyT> d_keys_remaining_passes(
-                (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[0]),
-                (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[0]) : d_keys.Alternate());
-
-            DoubleBuffer<ValueT> d_values_remaining_passes(
-                (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[1]),
-                (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[1]) : d_values.Alternate());
-
-            // Run first pass, consuming from the input's current buffers
-            int current_bit = begin_bit;
-
-            if (CubDebug(error = InvokePass(
-                d_keys.Current(), d_keys_remaining_passes.Current(),
-                d_values.Current(), d_values_remaining_passes.Current(),
-                current_bit,
-                (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
-
-            // Run remaining passes
-            while (current_bit < end_bit)
-            {
-                if (CubDebug(error = InvokePass(
-                    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector],    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
-                    d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector],  d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
-                    current_bit,
-                    (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
-
-                // Invert selectors and update current bit
-                d_keys_remaining_passes.selector ^= 1;
-                d_values_remaining_passes.selector ^= 1;
-            }
-
-            // Update selector
-            if (!is_overwrite_okay) {
-                num_passes = 1; // Sorted data always ends up in the other vector
-            }
-
-            d_keys.selector = (d_keys.selector + num_passes) & 1;
-            d_values.selector = (d_values.selector + num_passes) & 1;
-        }
-        while (0);
-
-        return error;
-
-#endif // CUB_RUNTIME_ENABLED
-    }
-
-
-    //------------------------------------------------------------------------------
-    // Chained policy invocation
-    //------------------------------------------------------------------------------
-
-    /// Invocation
-    template <typename ActivePolicyT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t Invoke()
-    {
-        typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;
-
-        // Force kernel code-generation in all compiler passes
-        return InvokePasses<ActivePolicyT>(
-            DeviceSegmentedRadixSortKernel<MaxPolicyT, false,   IS_DESCENDING, KeyT, ValueT, OffsetIteratorT, OffsetT>,
-            DeviceSegmentedRadixSortKernel<MaxPolicyT, true,    IS_DESCENDING, KeyT, ValueT, OffsetIteratorT, OffsetT>);
-    }
-
-
-    //------------------------------------------------------------------------------
-    // Dispatch entrypoints
-    //------------------------------------------------------------------------------
-
-
-    /// Internal dispatch routine
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*                   d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<KeyT>      &d_keys,                ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        DoubleBuffer<ValueT>    &d_values,              ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-        int                     num_items,              ///< [in] Number of items to sort
-        int                     num_segments,           ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT         d_begin_offsets,        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT         d_end_offsets,          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        int                     begin_bit,              ///< [in] The beginning (least-significant) bit index needed for key comparison
-        int                     end_bit,                ///< [in] The past-the-end (most-significant) bit index needed for key comparison
-        bool                    is_overwrite_okay,      ///< [in] Whether is okay to overwrite source buffers
-        cudaStream_t            stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous)      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;
-
-        cudaError_t error;
-        do {
-            // Get PTX version
-            int ptx_version;
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-
-            // Create dispatch functor
-            DispatchSegmentedRadixSort dispatch(
-                d_temp_storage, temp_storage_bytes,
-                d_keys, d_values,
-                num_items, num_segments, d_begin_offsets, d_end_offsets,
-                begin_bit, end_bit, is_overwrite_okay,
-                stream, debug_synchronous, ptx_version);
-
-            // Dispatch to chained policy
-            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
-
-        } while (0);
-
-        return error;
-    }
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh
deleted file mode 100644
index 44b1233a4..000000000
--- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh
+++ /dev/null
@@ -1,864 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "../../agent/agent_reduce.cuh"
-#include "../../iterator/arg_index_input_iterator.cuh"
-#include "../../thread/thread_operators.cuh"
-#include "../../grid/grid_even_share.cuh"
-#include "../../iterator/arg_index_input_iterator.cuh"
-#include "../../util_debug.cuh"
-#include "../../util_device.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/******************************************************************************
- * Kernel entry points
- *****************************************************************************/
-
-/**
- * Reduce region kernel entry point (multi-block).  Computes privatized reductions, one per thread block.
- */
-template <
-    typename                ChainedPolicyT,             ///< Chained tuning policy
-    typename                InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
-    typename                OutputIteratorT,            ///< Output iterator type for recording the reduced aggregate \iterator
-    typename                OffsetT,                    ///< Signed integer type for global offsets
-    typename                ReductionOpT>               ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
-__global__ void DeviceReduceKernel(
-    InputIteratorT          d_in,                       ///< [in] Pointer to the input sequence of data items
-    OutputIteratorT         d_out,                      ///< [out] Pointer to the output aggregate
-    OffsetT                 num_items,                  ///< [in] Total number of input data items
-    GridEvenShare<OffsetT>  even_share,                 ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
-    ReductionOpT            reduction_op)               ///< [in] Binary reduction functor
-{
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    // Thread block type for reducing input tiles
-    typedef AgentReduce<
-            typename ChainedPolicyT::ActivePolicy::ReducePolicy,
-            InputIteratorT,
-            OutputIteratorT,
-            OffsetT,
-            ReductionOpT>
-        AgentReduceT;
-
-    // Shared memory storage
-    __shared__ typename AgentReduceT::TempStorage temp_storage;
-
-    // Consume input tiles
-    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);
-
-    // Output result
-    if (threadIdx.x == 0)
-        d_out[blockIdx.x] = block_aggregate;
-}
-
-
-/**
- * Reduce a single tile kernel entry point (single-block).  Can be used to aggregate privatized thread block reductions from a previous multi-block reduction pass.
- */
-template <
-    typename                ChainedPolicyT,             ///< Chained tuning policy
-    typename                InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
-    typename                OutputIteratorT,            ///< Output iterator type for recording the reduced aggregate \iterator
-    typename                OffsetT,                    ///< Signed integer type for global offsets
-    typename                ReductionOpT,               ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-    typename                OutputT>                     ///< Data element type that is convertible to the \p value type of \p OutputIteratorT
-__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
-__global__ void DeviceReduceSingleTileKernel(
-    InputIteratorT          d_in,                       ///< [in] Pointer to the input sequence of data items
-    OutputIteratorT         d_out,                      ///< [out] Pointer to the output aggregate
-    OffsetT                 num_items,                  ///< [in] Total number of input data items
-    ReductionOpT            reduction_op,               ///< [in] Binary reduction functor
-    OutputT                  init)                       ///< [in] The initial value of the reduction
-{
-    // Thread block type for reducing input tiles
-    typedef AgentReduce<
-            typename ChainedPolicyT::ActivePolicy::SingleTilePolicy,
-            InputIteratorT,
-            OutputIteratorT,
-            OffsetT,
-            ReductionOpT>
-        AgentReduceT;
-
-    // Shared memory storage
-    __shared__ typename AgentReduceT::TempStorage temp_storage;
-
-    // Check if empty problem
-    if (num_items == 0)
-    {
-        if (threadIdx.x == 0)
-            *d_out = init;
-        return;
-    }
-
-    // Consume input tiles
-    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(
-        OffsetT(0),
-        num_items);
-
-    // Output result
-    if (threadIdx.x == 0)
-        *d_out = reduction_op(init, block_aggregate);
-}
-
-
-/// Normalize input iterator to segment offset
-template <typename T, typename OffsetT, typename IteratorT>
-__device__ __forceinline__
-void NormalizeReductionOutput(
-    T &/*val*/,
-    OffsetT /*base_offset*/,
-    IteratorT /*itr*/)
-{}
-
-
-/// Normalize input iterator to segment offset (specialized for arg-index)
-template <typename KeyValuePairT, typename OffsetT, typename WrappedIteratorT, typename OutputValueT>
-__device__ __forceinline__
-void NormalizeReductionOutput(
-    KeyValuePairT &val,
-    OffsetT base_offset,
-    ArgIndexInputIterator<WrappedIteratorT, OffsetT, OutputValueT> /*itr*/)
-{
-    val.key -= base_offset;
-}
-
-
-/**
- * Segmented reduction (one block per segment)
- */
-template <
-    typename                ChainedPolicyT,             ///< Chained tuning policy
-    typename                InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
-    typename                OutputIteratorT,            ///< Output iterator type for recording the reduced aggregate \iterator
-    typename                OffsetIteratorT,            ///< Random-access input iterator type for reading segment offsets \iterator
-    typename                OffsetT,                    ///< Signed integer type for global offsets
-    typename                ReductionOpT,               ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-    typename                OutputT>                    ///< Data element type that is convertible to the \p value type of \p OutputIteratorT
-__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
-__global__ void DeviceSegmentedReduceKernel(
-    InputIteratorT          d_in,                       ///< [in] Pointer to the input sequence of data items
-    OutputIteratorT         d_out,                      ///< [out] Pointer to the output aggregate
-    OffsetIteratorT         d_begin_offsets,            ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-    OffsetIteratorT         d_end_offsets,              ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-    int                     /*num_segments*/,           ///< [in] The number of segments that comprise the sorting data
-    ReductionOpT            reduction_op,               ///< [in] Binary reduction functor 
-    OutputT                 init)                       ///< [in] The initial value of the reduction
-{
-    // Thread block type for reducing input tiles
-    typedef AgentReduce<
-            typename ChainedPolicyT::ActivePolicy::ReducePolicy,
-            InputIteratorT,
-            OutputIteratorT,
-            OffsetT,
-            ReductionOpT>
-        AgentReduceT;
-
-    // Shared memory storage
-    __shared__ typename AgentReduceT::TempStorage temp_storage;
-
-    OffsetT segment_begin   = d_begin_offsets[blockIdx.x];
-    OffsetT segment_end     = d_end_offsets[blockIdx.x];
-
-    // Check if empty problem
-    if (segment_begin == segment_end)
-    {
-        if (threadIdx.x == 0)
-            d_out[blockIdx.x] = init;
-        return;
-    }
-
-    // Consume input tiles
-    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(
-        segment_begin,
-        segment_end);
-
-    // Normalize as needed
-    NormalizeReductionOutput(block_aggregate, segment_begin, d_in);
-
-    if (threadIdx.x == 0)
-        d_out[blockIdx.x] = reduction_op(init, block_aggregate);;
-}
-
-
-
-
-/******************************************************************************
- * Policy
- ******************************************************************************/
-
-template <
-    typename OutputT,            ///< Data type
-    typename OffsetT,           ///< Signed integer type for global offsets
-    typename ReductionOpT>      ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt> 
-struct DeviceReducePolicy
-{
-    //------------------------------------------------------------------------------
-    // Architecture-specific tuning policies
-    //------------------------------------------------------------------------------
-
-    /// SM13
-    struct Policy130 : ChainedPolicy<130, Policy130, Policy130>
-    {
-        // ReducePolicy
-        typedef AgentReducePolicy<
-                CUB_SCALED_GRANULARITIES(128, 8, OutputT), ///< Threads per block, items per thread
-                2,                                         ///< Number of items per vectorized load
-                BLOCK_REDUCE_RAKING,                       ///< Cooperative block-wide reduction algorithm to use
-                LOAD_DEFAULT>                              ///< Cache load modifier
-            ReducePolicy;
-
-        // SingleTilePolicy
-        typedef ReducePolicy SingleTilePolicy;
-
-        // SegmentedReducePolicy
-        typedef ReducePolicy SegmentedReducePolicy;
-    };
-
-
-    /// SM20
-    struct Policy200 : ChainedPolicy<200, Policy200, Policy130>
-    {
-        // ReducePolicy (GTX 580: 178.9 GB/s @ 48M 4B items, 158.1 GB/s @ 192M 1B items)
-        typedef AgentReducePolicy<
-                CUB_SCALED_GRANULARITIES(128, 8, OutputT), ///< Threads per block, items per thread
-                4,                                         ///< Number of items per vectorized load
-                BLOCK_REDUCE_RAKING,                       ///< Cooperative block-wide reduction algorithm to use
-                LOAD_DEFAULT>                              ///< Cache load modifier
-            ReducePolicy;
-
-        // SingleTilePolicy
-        typedef ReducePolicy SingleTilePolicy;
-
-        // SegmentedReducePolicy
-        typedef ReducePolicy SegmentedReducePolicy;
-    };
-
-
-    /// SM30
-    struct Policy300 : ChainedPolicy<300, Policy300, Policy200>
-    {
-        // ReducePolicy (GTX670: 154.0 @ 48M 4B items)
-        typedef AgentReducePolicy<
-                CUB_SCALED_GRANULARITIES(256, 20, OutputT), ///< Threads per block, items per thread
-                2,                                          ///< Number of items per vectorized load
-                BLOCK_REDUCE_WARP_REDUCTIONS,               ///< Cooperative block-wide reduction algorithm to use
-                LOAD_DEFAULT>                               ///< Cache load modifier
-            ReducePolicy;
-
-        // SingleTilePolicy
-        typedef ReducePolicy SingleTilePolicy;
-
-        // SegmentedReducePolicy
-        typedef ReducePolicy SegmentedReducePolicy;
-    };
-
-
-    /// SM35
-    struct Policy350 : ChainedPolicy<350, Policy350, Policy300>
-    {
-        // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items)
-        typedef AgentReducePolicy<
-                CUB_SCALED_GRANULARITIES(256, 20, OutputT), ///< Threads per block, items per thread
-                4,                                          ///< Number of items per vectorized load
-                BLOCK_REDUCE_WARP_REDUCTIONS,               ///< Cooperative block-wide reduction algorithm to use
-                LOAD_LDG>                                   ///< Cache load modifier
-            ReducePolicy;
-
-        // SingleTilePolicy
-        typedef ReducePolicy SingleTilePolicy;
-
-        // SegmentedReducePolicy
-        typedef ReducePolicy SegmentedReducePolicy;
-    };
-
-    /// SM60
-    struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
-    {
-        // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items)
-        typedef AgentReducePolicy<
-                CUB_SCALED_GRANULARITIES(256, 16, OutputT), ///< Threads per block, items per thread
-                4,                                          ///< Number of items per vectorized load
-                BLOCK_REDUCE_WARP_REDUCTIONS,               ///< Cooperative block-wide reduction algorithm to use
-                LOAD_LDG>                                   ///< Cache load modifier
-            ReducePolicy;
-
-        // SingleTilePolicy
-        typedef ReducePolicy SingleTilePolicy;
-
-        // SegmentedReducePolicy
-        typedef ReducePolicy SegmentedReducePolicy;
-    };
-
-
-    /// MaxPolicy
-    typedef Policy600 MaxPolicy;
-
-};
-
-
-
-/******************************************************************************
- * Single-problem dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction
- */
-template <
-    typename InputIteratorT,    ///< Random-access input iterator type for reading input items \iterator
-    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregate \iterator
-    typename OffsetT,           ///< Signed integer type for global offsets
-    typename ReductionOpT,      ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt> 
-    typename OutputT =          ///< Data type of the output iterator
-        typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-            typename std::iterator_traits<InputIteratorT>::value_type,                                  // ... then the input iterator's value type,
-            typename std::iterator_traits<OutputIteratorT>::value_type>::Type>                          // ... else the output iterator's value type
-struct DispatchReduce : DeviceReducePolicy<OutputT, OffsetT, ReductionOpT>
-{
-    //------------------------------------------------------------------------------
-    // Problem state
-    //------------------------------------------------------------------------------
-
-    void                *d_temp_storage;                ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-    size_t              &temp_storage_bytes;            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-    InputIteratorT      d_in;                           ///< [in] Pointer to the input sequence of data items
-    OutputIteratorT     d_out;                          ///< [out] Pointer to the output aggregate
-    OffsetT             num_items;                      ///< [in] Total number of input items (i.e., length of \p d_in)
-    ReductionOpT        reduction_op;                   ///< [in] Binary reduction functor 
-    OutputT             init;                           ///< [in] The initial value of the reduction
-    cudaStream_t        stream;                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-    bool                debug_synchronous;              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    int                 ptx_version;                    ///< [in] PTX version
-
-    //------------------------------------------------------------------------------
-    // Constructor
-    //------------------------------------------------------------------------------
-
-    /// Constructor
-    CUB_RUNTIME_FUNCTION __forceinline__
-    DispatchReduce(
-        void*                   d_temp_storage,
-        size_t                  &temp_storage_bytes,
-        InputIteratorT          d_in,
-        OutputIteratorT         d_out,
-        OffsetT                 num_items,
-        ReductionOpT            reduction_op,
-        OutputT                 init,
-        cudaStream_t            stream,
-        bool                    debug_synchronous,
-        int                     ptx_version)
-    :
-        d_temp_storage(d_temp_storage),
-        temp_storage_bytes(temp_storage_bytes),
-        d_in(d_in),
-        d_out(d_out),
-        num_items(num_items),
-        reduction_op(reduction_op),
-        init(init),
-        stream(stream),
-        debug_synchronous(debug_synchronous),
-        ptx_version(ptx_version)
-    {}
-
-
-    //------------------------------------------------------------------------------
-    // Small-problem (single tile) invocation
-    //------------------------------------------------------------------------------
-
-    /// Invoke a single block block to reduce in-core
-    template <
-        typename                ActivePolicyT,          ///< Umbrella policy active for the target device
-        typename                SingleTileKernelT>      ///< Function type of cub::DeviceReduceSingleTileKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t InvokeSingleTile(
-        SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
-    {
-#ifndef CUB_RUNTIME_ENABLED
-        (void)single_tile_kernel;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-#else
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Return if the caller is simply requesting the size of the storage allocation
-            if (d_temp_storage == NULL)
-            {
-                temp_storage_bytes = 1;
-                break;
-            }
-
-            // Log single_reduce_sweep_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
-                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
-                (long long) stream,
-                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);
-
-            // Invoke single_reduce_sweep_kernel
-            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
-                d_in,
-                d_out,
-                num_items,
-                reduction_op,
-                init);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-        }
-        while (0);
-
-        return error;
-
-#endif // CUB_RUNTIME_ENABLED
-    }
-
-
-    //------------------------------------------------------------------------------
-    // Normal problem size invocation (two-pass)
-    //------------------------------------------------------------------------------
-
-    /// Invoke two-passes to reduce
-    template <
-        typename                ActivePolicyT,              ///< Umbrella policy active for the target device
-        typename                ReduceKernelT,              ///< Function type of cub::DeviceReduceKernel
-        typename                SingleTileKernelT>          ///< Function type of cub::DeviceReduceSingleTileKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t InvokePasses(
-        ReduceKernelT           reduce_kernel,          ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceKernel
-        SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
-    {
-#ifndef CUB_RUNTIME_ENABLED
-        (void)                  reduce_kernel;
-        (void)                  single_tile_kernel;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-#else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Init regular kernel configuration
-            KernelConfig reduce_config;
-            if (CubDebug(error = reduce_config.Init<typename ActivePolicyT::ReducePolicy>(reduce_kernel))) break;
-            int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count;
-
-            // Even-share work distribution
-            int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(ptx_version);
-            GridEvenShare<OffsetT> even_share;
-            even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size);
-
-            // Temporary storage allocation requirements
-            void* allocations[1];
-            size_t allocation_sizes[1] =
-            {
-                max_blocks * sizeof(OutputT)    // bytes needed for privatized block reductions
-            };
-
-            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                return cudaSuccess;
-            }
-
-            // Alias the allocation for the privatized per-block reductions
-            OutputT *d_block_reductions = (OutputT*) allocations[0];
-
-            // Get grid size for device_reduce_sweep_kernel
-            int reduce_grid_size = even_share.grid_size;
-
-            // Log device_reduce_sweep_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                reduce_grid_size,
-                ActivePolicyT::ReducePolicy::BLOCK_THREADS,
-                (long long) stream,
-                ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD,
-                reduce_config.sm_occupancy);
-
-            // Invoke DeviceReduceKernel
-            reduce_kernel<<<reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream>>>(
-                d_in,
-                d_block_reductions,
-                num_items,
-                even_share,
-                reduction_op);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Log single_reduce_sweep_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
-                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
-                (long long) stream,
-                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);
-
-            // Invoke DeviceReduceSingleTileKernel
-            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
-                d_block_reductions,
-                d_out,
-                reduce_grid_size,
-                reduction_op,
-                init);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-        }
-        while (0);
-
-        return error;
-
-#endif // CUB_RUNTIME_ENABLED
-
-    }
-
-
-    //------------------------------------------------------------------------------
-    // Chained policy invocation
-    //------------------------------------------------------------------------------
-
-    /// Invocation
-    template <typename ActivePolicyT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t Invoke()
-    {
-        typedef typename ActivePolicyT::SingleTilePolicy    SingleTilePolicyT;
-        typedef typename DispatchReduce::MaxPolicy          MaxPolicyT;
-
-        // Force kernel code-generation in all compiler passes
-        if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))
-        {
-            // Small, single tile size
-            return InvokeSingleTile<ActivePolicyT>(
-                DeviceReduceSingleTileKernel<MaxPolicyT, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);
-        }
-        else
-        {
-            // Regular size
-            return InvokePasses<ActivePolicyT>(
-                DeviceReduceKernel<typename DispatchReduce::MaxPolicy, InputIteratorT, OutputT*, OffsetT, ReductionOpT>,
-                DeviceReduceSingleTileKernel<MaxPolicyT, OutputT*, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);
-        }
-    }
-
-
-    //------------------------------------------------------------------------------
-    // Dispatch entrypoints
-    //------------------------------------------------------------------------------
-
-    /**
-     * Internal dispatch routine for computing a device-wide reduction
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT d_out,                              ///< [out] Pointer to the output aggregate
-        OffsetT         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
-        ReductionOpT    reduction_op,                       ///< [in] Binary reduction functor 
-        OutputT         init,                               ///< [in] The initial value of the reduction
-        cudaStream_t    stream,                             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool            debug_synchronous)                  ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        typedef typename DispatchReduce::MaxPolicy MaxPolicyT;
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-
-            // Create dispatch functor
-            DispatchReduce dispatch(
-                d_temp_storage, temp_storage_bytes,
-                d_in, d_out, num_items, reduction_op, init,
-                stream, debug_synchronous, ptx_version);
-
-            // Dispatch to chained policy
-            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
-        }
-        while (0);
-
-        return error;
-    }
-};
-
-
-
-/******************************************************************************
- * Segmented dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction
- */
-template <
-    typename InputIteratorT,    ///< Random-access input iterator type for reading input items \iterator
-    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregate \iterator
-    typename OffsetIteratorT,   ///< Random-access input iterator type for reading segment offsets \iterator
-    typename OffsetT,           ///< Signed integer type for global offsets
-    typename ReductionOpT,      ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt> 
-    typename OutputT =          ///< Data type of the output iterator
-        typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-            typename std::iterator_traits<InputIteratorT>::value_type,                                  // ... then the input iterator's value type,
-            typename std::iterator_traits<OutputIteratorT>::value_type>::Type>                          // ... else the output iterator's value type
-struct DispatchSegmentedReduce :
-    DeviceReducePolicy<
-        typename std::iterator_traits<InputIteratorT>::value_type,
-        OffsetT,
-        ReductionOpT>
-{
-    //------------------------------------------------------------------------------
-    // Problem state
-    //------------------------------------------------------------------------------
-
-    void                *d_temp_storage;        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-    size_t              &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-    InputIteratorT      d_in;                   ///< [in] Pointer to the input sequence of data items
-    OutputIteratorT     d_out;                  ///< [out] Pointer to the output aggregate
-    OffsetT             num_segments;           ///< [in] The number of segments that comprise the sorting data
-    OffsetIteratorT     d_begin_offsets;        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-    OffsetIteratorT     d_end_offsets;          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-    ReductionOpT        reduction_op;           ///< [in] Binary reduction functor 
-    OutputT             init;                   ///< [in] The initial value of the reduction
-    cudaStream_t        stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-    bool                debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    int                 ptx_version;            ///< [in] PTX version
-
-    //------------------------------------------------------------------------------
-    // Constructor
-    //------------------------------------------------------------------------------
-
-    /// Constructor
-    CUB_RUNTIME_FUNCTION __forceinline__
-    DispatchSegmentedReduce(
-        void*                   d_temp_storage,
-        size_t                  &temp_storage_bytes,
-        InputIteratorT          d_in,
-        OutputIteratorT         d_out,
-        OffsetT                 num_segments,
-        OffsetIteratorT         d_begin_offsets,
-        OffsetIteratorT         d_end_offsets,
-        ReductionOpT            reduction_op,
-        OutputT                 init,
-        cudaStream_t            stream,
-        bool                    debug_synchronous,
-        int                     ptx_version)
-    :
-        d_temp_storage(d_temp_storage),
-        temp_storage_bytes(temp_storage_bytes),
-        d_in(d_in),
-        d_out(d_out),
-        num_segments(num_segments),
-        d_begin_offsets(d_begin_offsets),
-        d_end_offsets(d_end_offsets),
-        reduction_op(reduction_op),
-        init(init),
-        stream(stream),
-        debug_synchronous(debug_synchronous),
-        ptx_version(ptx_version)
-    {}
-
-
-
-    //------------------------------------------------------------------------------
-    // Chained policy invocation
-    //------------------------------------------------------------------------------
-
-    /// Invocation
-    template <
-        typename                        ActivePolicyT,                  ///< Umbrella policy active for the target device
-        typename                        DeviceSegmentedReduceKernelT>   ///< Function type of cub::DeviceSegmentedReduceKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t InvokePasses(
-        DeviceSegmentedReduceKernelT    segmented_reduce_kernel)        ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedReduceKernel
-    {
-#ifndef CUB_RUNTIME_ENABLED
-        (void)segmented_reduce_kernel;
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-#else
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Return if the caller is simply requesting the size of the storage allocation
-            if (d_temp_storage == NULL)
-            {
-                temp_storage_bytes = 1;
-                return cudaSuccess;
-            }
-
-            // Init kernel configuration
-            KernelConfig segmented_reduce_config;
-            if (CubDebug(error = segmented_reduce_config.Init<typename ActivePolicyT::SegmentedReducePolicy>(segmented_reduce_kernel))) break;
-
-            // Log device_reduce_sweep_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                num_segments,
-                ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS,
-                (long long) stream,
-                ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD,
-                segmented_reduce_config.sm_occupancy);
-
-            // Invoke DeviceReduceKernel
-            segmented_reduce_kernel<<<num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream>>>(
-                d_in,
-                d_out,
-                d_begin_offsets,
-                d_end_offsets,
-                num_segments,
-                reduction_op,
-                init);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-        }
-        while (0);
-
-        return error;
-
-#endif // CUB_RUNTIME_ENABLED
-
-    }
-
-
-    /// Invocation
-    template <typename ActivePolicyT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t Invoke()
-    {
-        typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;
-
-        // Force kernel code-generation in all compiler passes
-        return InvokePasses<ActivePolicyT>(
-            DeviceSegmentedReduceKernel<MaxPolicyT, InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, ReductionOpT, OutputT>);
-    }
-
-
-    //------------------------------------------------------------------------------
-    // Dispatch entrypoints
-    //------------------------------------------------------------------------------
-
-    /**
-     * Internal dispatch routine for computing a device-wide reduction
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void            *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT  d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT d_out,                              ///< [out] Pointer to the output aggregate
-        int             num_segments,                       ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT d_begin_offsets,                    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that <tt>d_begin_offsets[i]</tt> is the first element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>
-        OffsetIteratorT d_end_offsets,                      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that <tt>d_end_offsets[i]-1</tt> is the last element of the <em>i</em><sup>th</sup> data segment in <tt>d_keys_*</tt> and <tt>d_values_*</tt>.  If <tt>d_end_offsets[i]-1</tt> <= <tt>d_begin_offsets[i]</tt>, the <em>i</em><sup>th</sup> is considered empty.
-        ReductionOpT    reduction_op,                       ///< [in] Binary reduction functor 
-        OutputT         init,                               ///< [in] The initial value of the reduction
-        cudaStream_t    stream,                             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool            debug_synchronous)                  ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT;
-
-        if (num_segments <= 0)
-            return cudaSuccess;
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-
-            // Create dispatch functor
-            DispatchSegmentedReduce dispatch(
-                d_temp_storage, temp_storage_bytes,
-                d_in, d_out,
-                num_segments, d_begin_offsets, d_end_offsets,
-                reduction_op, init,
-                stream, debug_synchronous, ptx_version);
-
-            // Dispatch to chained policy
-            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
-        }
-        while (0);
-
-        return error;
-    }
-};
-
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh
deleted file mode 100644
index 38bee414e..000000000
--- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ /dev/null
@@ -1,554 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceReduceByKey provides device-wide, parallel operations for reducing segments of values residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch_scan.cuh"
-#include "../../agent/agent_reduce_by_key.cuh"
-#include "../../thread/thread_operators.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../util_device.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/******************************************************************************
- * Kernel entry points
- *****************************************************************************/
-
-/**
- * Multi-block reduce-by-key sweep kernel entry point
- */
-template <
-    typename            AgentReduceByKeyPolicyT,                 ///< Parameterized AgentReduceByKeyPolicyT tuning policy type
-    typename            KeysInputIteratorT,                     ///< Random-access input iterator type for keys
-    typename            UniqueOutputIteratorT,                  ///< Random-access output iterator type for keys
-    typename            ValuesInputIteratorT,                   ///< Random-access input iterator type for values
-    typename            AggregatesOutputIteratorT,              ///< Random-access output iterator type for values
-    typename            NumRunsOutputIteratorT,                 ///< Output iterator type for recording number of segments encountered
-    typename            ScanTileStateT,                         ///< Tile status interface type
-    typename            EqualityOpT,                            ///< KeyT equality operator type
-    typename            ReductionOpT,                           ///< ValueT reduction operator type
-    typename            OffsetT>                                ///< Signed integer type for global offsets
-__launch_bounds__ (int(AgentReduceByKeyPolicyT::BLOCK_THREADS))
-__global__ void DeviceReduceByKeyKernel(
-    KeysInputIteratorT          d_keys_in,                      ///< Pointer to the input sequence of keys
-    UniqueOutputIteratorT       d_unique_out,                   ///< Pointer to the output sequence of unique keys (one key per run)
-    ValuesInputIteratorT        d_values_in,                    ///< Pointer to the input sequence of corresponding values
-    AggregatesOutputIteratorT   d_aggregates_out,               ///< Pointer to the output sequence of value aggregates (one aggregate per run)
-    NumRunsOutputIteratorT      d_num_runs_out,                 ///< Pointer to total number of runs encountered (i.e., the length of d_unique_out)
-    ScanTileStateT              tile_state,                     ///< Tile status interface
-    int                         start_tile,                     ///< The starting tile for the current grid
-    EqualityOpT                 equality_op,                    ///< KeyT equality operator
-    ReductionOpT                reduction_op,                   ///< ValueT reduction operator
-    OffsetT                     num_items)                      ///< Total number of items to select from
-{
-    // Thread block type for reducing tiles of value segments
-    typedef AgentReduceByKey<
-            AgentReduceByKeyPolicyT,
-            KeysInputIteratorT,
-            UniqueOutputIteratorT,
-            ValuesInputIteratorT,
-            AggregatesOutputIteratorT,
-            NumRunsOutputIteratorT,
-            EqualityOpT,
-            ReductionOpT,
-            OffsetT>
-        AgentReduceByKeyT;
-
-    // Shared memory for AgentReduceByKey
-    __shared__ typename AgentReduceByKeyT::TempStorage temp_storage;
-
-    // Process tiles
-    AgentReduceByKeyT(temp_storage, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op).ConsumeRange(
-        num_items,
-        tile_state,
-        start_tile);
-}
-
-
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey
- */
-template <
-    typename    KeysInputIteratorT,         ///< Random-access input iterator type for keys
-    typename    UniqueOutputIteratorT,      ///< Random-access output iterator type for keys
-    typename    ValuesInputIteratorT,       ///< Random-access input iterator type for values
-    typename    AggregatesOutputIteratorT,  ///< Random-access output iterator type for values
-    typename    NumRunsOutputIteratorT,     ///< Output iterator type for recording number of segments encountered
-    typename    EqualityOpT,                ///< KeyT equality operator type
-    typename    ReductionOpT,               ///< ValueT reduction operator type
-    typename    OffsetT>                    ///< Signed integer type for global offsets
-struct DispatchReduceByKey
-{
-    //-------------------------------------------------------------------------
-    // Types and constants
-    //-------------------------------------------------------------------------
-
-    // The input keys type
-    typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;
-
-    // The output keys type
-    typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE),    // KeyOutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<KeysInputIteratorT>::value_type,                                              // ... then the input iterator's value type,
-        typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type KeyOutputT;                         // ... else the output iterator's value type
-
-    // The input values type
-    typedef typename std::iterator_traits<ValuesInputIteratorT>::value_type ValueInputT;
-
-    // The output values type
-    typedef typename If<(Equals<typename std::iterator_traits<AggregatesOutputIteratorT>::value_type, void>::VALUE),    // ValueOutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<ValuesInputIteratorT>::value_type,                                                // ... then the input iterator's value type,
-        typename std::iterator_traits<AggregatesOutputIteratorT>::value_type>::Type ValueOutputT;                       // ... else the output iterator's value type
-
-    enum
-    {
-        INIT_KERNEL_THREADS     = 128,
-        MAX_INPUT_BYTES         = CUB_MAX(sizeof(KeyOutputT), sizeof(ValueOutputT)),
-        COMBINED_INPUT_BYTES    = sizeof(KeyOutputT) + sizeof(ValueOutputT),
-    };
-
-    // Tile status descriptor interface type
-    typedef ReduceByKeyScanTileState<ValueOutputT, OffsetT> ScanTileStateT;
-
-
-    //-------------------------------------------------------------------------
-    // Tuning policies
-    //-------------------------------------------------------------------------
-
-    /// SM35
-    struct Policy350
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 6,
-            ITEMS_PER_THREAD            = (MAX_INPUT_BYTES <= 8) ? 6 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
-        };
-
-        typedef AgentReduceByKeyPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_DIRECT,
-                LOAD_LDG,
-                BLOCK_SCAN_WARP_SCANS>
-            ReduceByKeyPolicyT;
-    };
-
-    /// SM30
-    struct Policy300
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 6,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
-        };
-
-        typedef AgentReduceByKeyPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS>
-            ReduceByKeyPolicyT;
-    };
-
-    /// SM20
-    struct Policy200
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 11,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
-        };
-
-        typedef AgentReduceByKeyPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS>
-            ReduceByKeyPolicyT;
-    };
-
-    /// SM13
-    struct Policy130
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 7,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
-        };
-
-        typedef AgentReduceByKeyPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS>
-            ReduceByKeyPolicyT;
-    };
-
-    /// SM11
-    struct Policy110
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 5,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)),
-        };
-
-        typedef AgentReduceByKeyPolicy<
-                64,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_RAKING>
-            ReduceByKeyPolicyT;
-    };
-
-
-    /******************************************************************************
-     * Tuning policies of current PTX compiler pass
-     ******************************************************************************/
-
-#if (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 130)
-    typedef Policy130 PtxPolicy;
-
-#else
-    typedef Policy110 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicyT {};
-
-
-    /******************************************************************************
-     * Utilities
-     ******************************************************************************/
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <typename KernelConfig>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static void InitConfigs(
-        int             ptx_version,
-        KernelConfig    &reduce_by_key_config)
-    {
-    #if (CUB_PTX_ARCH > 0)
-        (void)ptx_version;
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        reduce_by_key_config.template Init<PtxReduceByKeyPolicy>();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 350)
-        {
-            reduce_by_key_config.template Init<typename Policy350::ReduceByKeyPolicyT>();
-        }
-        else if (ptx_version >= 300)
-        {
-            reduce_by_key_config.template Init<typename Policy300::ReduceByKeyPolicyT>();
-        }
-        else if (ptx_version >= 200)
-        {
-            reduce_by_key_config.template Init<typename Policy200::ReduceByKeyPolicyT>();
-        }
-        else if (ptx_version >= 130)
-        {
-            reduce_by_key_config.template Init<typename Policy130::ReduceByKeyPolicyT>();
-        }
-        else
-        {
-            reduce_by_key_config.template Init<typename Policy110::ReduceByKeyPolicyT>();
-        }
-
-    #endif
-    }
-
-
-    /**
-     * Kernel kernel dispatch configuration.
-     */
-    struct KernelConfig
-    {
-        int block_threads;
-        int items_per_thread;
-        int tile_items;
-
-        template <typename PolicyT>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Init()
-        {
-            block_threads       = PolicyT::BLOCK_THREADS;
-            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
-            tile_items          = block_threads * items_per_thread;
-        }
-    };
-
-
-    //---------------------------------------------------------------------
-    // Dispatch entrypoints
-    //---------------------------------------------------------------------
-
-    /**
-     * Internal dispatch routine for computing a device-wide reduce-by-key using the
-     * specified kernel functions.
-     */
-    template <
-        typename                    ScanInitKernelT,         ///< Function type of cub::DeviceScanInitKernel
-        typename                    ReduceByKeyKernelT>      ///< Function type of cub::DeviceReduceByKeyKernelT
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*                       d_temp_storage,             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&                     temp_storage_bytes,         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        KeysInputIteratorT          d_keys_in,                  ///< [in] Pointer to the input sequence of keys
-        UniqueOutputIteratorT       d_unique_out,               ///< [out] Pointer to the output sequence of unique keys (one key per run)
-        ValuesInputIteratorT        d_values_in,                ///< [in] Pointer to the input sequence of corresponding values
-        AggregatesOutputIteratorT   d_aggregates_out,           ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
-        NumRunsOutputIteratorT      d_num_runs_out,             ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
-        EqualityOpT                 equality_op,                ///< [in] KeyT equality operator
-        ReductionOpT                reduction_op,               ///< [in] ValueT reduction operator
-        OffsetT                     num_items,                  ///< [in] Total number of items to select from
-        cudaStream_t                stream,                     ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous,          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-        int                         /*ptx_version*/,            ///< [in] PTX version of dispatch kernels
-        ScanInitKernelT                init_kernel,                ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
-        ReduceByKeyKernelT             reduce_by_key_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeyKernel
-        KernelConfig                reduce_by_key_config)       ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_kernel was compiled for
-    {
-
-#ifndef CUB_RUNTIME_ENABLED
-      (void)d_temp_storage;
-      (void)temp_storage_bytes;
-      (void)d_keys_in;
-      (void)d_unique_out;
-      (void)d_values_in;
-      (void)d_aggregates_out;
-      (void)d_num_runs_out;
-      (void)equality_op;
-      (void)reduction_op;
-      (void)num_items;
-      (void)stream;
-      (void)debug_synchronous;
-      (void)init_kernel;
-      (void)reduce_by_key_kernel;
-      (void)reduce_by_key_config;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-#else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Number of input tiles
-            int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread;
-            int num_tiles = (num_items + tile_size - 1) / tile_size;
-
-            // Specify temporary storage allocation requirements
-            size_t  allocation_sizes[1];
-            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
-
-            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
-            void* allocations[1];
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                break;
-            }
-
-            // Construct the tile status interface
-            ScanTileStateT tile_state;
-            if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
-
-            // Log init_kernel configuration
-            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
-            if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
-
-            // Invoke init_kernel to initialize tile descriptors
-            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
-                tile_state,
-                num_tiles,
-                d_num_runs_out);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Return if empty problem
-            if (num_items == 0)
-                break;
-
-            // Get SM occupancy for reduce_by_key_kernel
-            int reduce_by_key_sm_occupancy;
-            if (CubDebug(error = MaxSmOccupancy(
-                reduce_by_key_sm_occupancy,            // out
-                reduce_by_key_kernel,
-                reduce_by_key_config.block_threads))) break;
-
-            // Get max x-dimension of grid
-            int max_dim_x;
-            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
-
-            // Run grids in epochs (in case number of tiles exceeds max x-dimension
-            int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
-            for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
-            {
-                // Log reduce_by_key_kernel configuration
-                if (debug_synchronous) _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                    start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy);
-
-                // Invoke reduce_by_key_kernel
-                reduce_by_key_kernel<<<scan_grid_size, reduce_by_key_config.block_threads, 0, stream>>>(
-                    d_keys_in,
-                    d_unique_out,
-                    d_values_in,
-                    d_aggregates_out,
-                    d_num_runs_out,
-                    tile_state,
-                    start_tile,
-                    equality_op,
-                    reduction_op,
-                    num_items);
-
-                // Check for failure to launch
-                if (CubDebug(error = cudaPeekAtLastError())) break;
-
-                // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-            }
-        }
-        while (0);
-
-        return error;
-
-#endif  // CUB_RUNTIME_ENABLED
-    }
-
-
-    /**
-     * Internal dispatch routine
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        KeysInputIteratorT          d_keys_in,                      ///< [in] Pointer to the input sequence of keys
-        UniqueOutputIteratorT       d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
-        ValuesInputIteratorT        d_values_in,                    ///< [in] Pointer to the input sequence of corresponding values
-        AggregatesOutputIteratorT   d_aggregates_out,               ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
-        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
-        EqualityOpT                 equality_op,                    ///< [in] KeyT equality operator
-        ReductionOpT                reduction_op,                   ///< [in] ValueT reduction operator
-        OffsetT                     num_items,                      ///< [in] Total number of items to select from
-        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous)              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel kernel dispatch configurations
-            KernelConfig reduce_by_key_config;
-            InitConfigs(ptx_version, reduce_by_key_config);
-
-            // Dispatch
-            if (CubDebug(error = Dispatch(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_keys_in,
-                d_unique_out,
-                d_values_in,
-                d_aggregates_out,
-                d_num_runs_out,
-                equality_op,
-                reduction_op,
-                num_items,
-                stream,
-                debug_synchronous,
-                ptx_version,
-                DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
-                DeviceReduceByKeyKernel<PtxReduceByKeyPolicy, KeysInputIteratorT, UniqueOutputIteratorT, ValuesInputIteratorT, AggregatesOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, ReductionOpT, OffsetT>,
-                reduce_by_key_config))) break;
-        }
-        while (0);
-
-        return error;
-    }
-};
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh
deleted file mode 100644
index 0d244a8a6..000000000
--- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh
+++ /dev/null
@@ -1,538 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceRle provides device-wide, parallel operations for run-length-encoding sequences of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch_scan.cuh"
-#include "../../agent/agent_rle.cuh"
-#include "../../thread/thread_operators.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../util_device.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Kernel entry points
- *****************************************************************************/
-
-/**
- * Select kernel entry point (multi-block)
- *
- * Performs functor-based selection if SelectOp functor type != NullType
- * Otherwise performs flag-based selection if FlagIterator's value type != NullType
- * Otherwise performs discontinuity selection (keep unique)
- */
-template <
-    typename            AgentRlePolicyT,        ///< Parameterized AgentRlePolicyT tuning policy type
-    typename            InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
-    typename            OffsetsOutputIteratorT,     ///< Random-access output iterator type for writing run-offset values \iterator
-    typename            LengthsOutputIteratorT,     ///< Random-access output iterator type for writing run-length values \iterator
-    typename            NumRunsOutputIteratorT,     ///< Output iterator type for recording the number of runs encountered \iterator
-    typename            ScanTileStateT,              ///< Tile status interface type
-    typename            EqualityOpT,                 ///< T equality operator type
-    typename            OffsetT>                    ///< Signed integer type for global offsets
-__launch_bounds__ (int(AgentRlePolicyT::BLOCK_THREADS))
-__global__ void DeviceRleSweepKernel(
-    InputIteratorT              d_in,               ///< [in] Pointer to input sequence of data items
-    OffsetsOutputIteratorT      d_offsets_out,      ///< [out] Pointer to output sequence of run-offsets
-    LengthsOutputIteratorT      d_lengths_out,      ///< [out] Pointer to output sequence of run-lengths
-    NumRunsOutputIteratorT      d_num_runs_out,     ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
-    ScanTileStateT              tile_status,        ///< [in] Tile status interface
-    EqualityOpT                 equality_op,        ///< [in] Equality operator for input items
-    OffsetT                     num_items,          ///< [in] Total number of input items (i.e., length of \p d_in)
-    int                         num_tiles)          ///< [in] Total number of tiles for the entire problem
-{
-    // Thread block type for selecting data from input tiles
-    typedef AgentRle<
-        AgentRlePolicyT,
-        InputIteratorT,
-        OffsetsOutputIteratorT,
-        LengthsOutputIteratorT,
-        EqualityOpT,
-        OffsetT> AgentRleT;
-
-    // Shared memory for AgentRle
-    __shared__ typename AgentRleT::TempStorage temp_storage;
-
-    // Process tiles
-    AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items).ConsumeRange(
-        num_tiles,
-        tile_status,
-        d_num_runs_out);
-}
-
-
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceRle
- */
-template <
-    typename            InputIteratorT,             ///< Random-access input iterator type for reading input items \iterator
-    typename            OffsetsOutputIteratorT,     ///< Random-access output iterator type for writing run-offset values \iterator
-    typename            LengthsOutputIteratorT,     ///< Random-access output iterator type for writing run-length values \iterator
-    typename            NumRunsOutputIteratorT,     ///< Output iterator type for recording the number of runs encountered \iterator
-    typename            EqualityOpT,                ///< T equality operator type
-    typename            OffsetT>                    ///< Signed integer type for global offsets
-struct DeviceRleDispatch
-{
-    /******************************************************************************
-     * Types and constants
-     ******************************************************************************/
-
-    // The input value type
-    typedef typename std::iterator_traits<InputIteratorT>::value_type T;
-
-    // The lengths output value type
-    typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),   // LengthT =  (if output iterator's value type is void) ?
-        OffsetT,                                                                                                    // ... then the OffsetT type,
-        typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                           // ... else the output iterator's value type
-
-    enum
-    {
-        INIT_KERNEL_THREADS = 128,
-    };
-
-    // Tile status descriptor interface type
-    typedef ReduceByKeyScanTileState<LengthT, OffsetT> ScanTileStateT;
-
-
-    /******************************************************************************
-     * Tuning policies
-     ******************************************************************************/
-
-    /// SM35
-    struct Policy350
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 15,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef AgentRlePolicy<
-                96,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_DIRECT,
-                LOAD_LDG,
-                true,
-                BLOCK_SCAN_WARP_SCANS>
-            RleSweepPolicy;
-    };
-
-    /// SM30
-    struct Policy300
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 5,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef AgentRlePolicy<
-                256,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                true,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            RleSweepPolicy;
-    };
-
-    /// SM20
-    struct Policy200
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 15,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef AgentRlePolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                false,
-                BLOCK_SCAN_WARP_SCANS>
-            RleSweepPolicy;
-    };
-
-    /// SM13
-    struct Policy130
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 9,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef AgentRlePolicy<
-                64,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                true,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            RleSweepPolicy;
-    };
-
-    /// SM10
-    struct Policy100
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 9,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef AgentRlePolicy<
-                256,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                true,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            RleSweepPolicy;
-    };
-
-
-    /******************************************************************************
-     * Tuning policies of current PTX compiler pass
-     ******************************************************************************/
-
-#if (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 130)
-    typedef Policy130 PtxPolicy;
-
-#else
-    typedef Policy100 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxRleSweepPolicy : PtxPolicy::RleSweepPolicy {};
-
-
-    /******************************************************************************
-     * Utilities
-     ******************************************************************************/
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <typename KernelConfig>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static void InitConfigs(
-        int             ptx_version,
-        KernelConfig&   device_rle_config)
-    {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        device_rle_config.template Init<PtxRleSweepPolicy>();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 350)
-        {
-            device_rle_config.template Init<typename Policy350::RleSweepPolicy>();
-        }
-        else if (ptx_version >= 300)
-        {
-            device_rle_config.template Init<typename Policy300::RleSweepPolicy>();
-        }
-        else if (ptx_version >= 200)
-        {
-            device_rle_config.template Init<typename Policy200::RleSweepPolicy>();
-        }
-        else if (ptx_version >= 130)
-        {
-            device_rle_config.template Init<typename Policy130::RleSweepPolicy>();
-        }
-        else
-        {
-            device_rle_config.template Init<typename Policy100::RleSweepPolicy>();
-        }
-
-    #endif
-    }
-
-
-    /**
-     * Kernel kernel dispatch configuration.  Mirrors the constants within AgentRlePolicyT.
-     */
-    struct KernelConfig
-    {
-        int                     block_threads;
-        int                     items_per_thread;
-        BlockLoadAlgorithm      load_policy;
-        bool                    store_warp_time_slicing;
-        BlockScanAlgorithm      scan_algorithm;
-
-        template <typename AgentRlePolicyT>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Init()
-        {
-            block_threads               = AgentRlePolicyT::BLOCK_THREADS;
-            items_per_thread            = AgentRlePolicyT::ITEMS_PER_THREAD;
-            load_policy                 = AgentRlePolicyT::LOAD_ALGORITHM;
-            store_warp_time_slicing     = AgentRlePolicyT::STORE_WARP_TIME_SLICING;
-            scan_algorithm              = AgentRlePolicyT::SCAN_ALGORITHM;
-        }
-
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Print()
-        {
-            printf("%d, %d, %d, %d, %d",
-                block_threads,
-                items_per_thread,
-                load_policy,
-                store_warp_time_slicing,
-                scan_algorithm);
-        }
-    };
-
-
-    /******************************************************************************
-     * Dispatch entrypoints
-     ******************************************************************************/
-
-    /**
-     * Internal dispatch routine for computing a device-wide run-length-encode using the
-     * specified kernel functions.
-     */
-    template <
-        typename                    DeviceScanInitKernelPtr,        ///< Function type of cub::DeviceScanInitKernel
-        typename                    DeviceRleSweepKernelPtr>        ///< Function type of cub::DeviceRleSweepKernelPtr
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
-        OffsetsOutputIteratorT      d_offsets_out,                  ///< [out] Pointer to the output sequence of run-offsets
-        LengthsOutputIteratorT      d_lengths_out,                  ///< [out] Pointer to the output sequence of run-lengths
-        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to the total number of runs encountered (i.e., length of \p d_offsets_out)
-        EqualityOpT                 equality_op,                    ///< [in] Equality operator for input items
-        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous,              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-        int                         ptx_version,                    ///< [in] PTX version of dispatch kernels
-        DeviceScanInitKernelPtr     device_scan_init_kernel,        ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
-        DeviceRleSweepKernelPtr     device_rle_sweep_kernel,        ///< [in] Kernel function pointer to parameterization of cub::DeviceRleSweepKernel
-        KernelConfig                device_rle_config)              ///< [in] Dispatch parameters that match the policy that \p device_rle_sweep_kernel was compiled for
-    {
-
-#ifndef CUB_RUNTIME_ENABLED
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-#else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Number of input tiles
-            int tile_size = device_rle_config.block_threads * device_rle_config.items_per_thread;
-            int num_tiles = (num_items + tile_size - 1) / tile_size;
-
-            // Specify temporary storage allocation requirements
-            size_t  allocation_sizes[1];
-            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
-
-            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
-            void* allocations[1];
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                break;
-            }
-
-            // Construct the tile status interface
-            ScanTileStateT tile_status;
-            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
-
-            // Log device_scan_init_kernel configuration
-            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
-            if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
-
-            // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors
-            device_scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
-                tile_status,
-                num_tiles,
-                d_num_runs_out);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Return if empty problem
-            if (num_items == 0)
-                break;
-
-            // Get SM occupancy for device_rle_sweep_kernel
-            int device_rle_kernel_sm_occupancy;
-            if (CubDebug(error = MaxSmOccupancy(
-                device_rle_kernel_sm_occupancy,            // out
-                device_rle_sweep_kernel,
-                device_rle_config.block_threads))) break;
-
-            // Get max x-dimension of grid
-            int max_dim_x;
-            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
-
-            // Get grid size for scanning tiles
-            dim3 scan_grid_size;
-            scan_grid_size.z = 1;
-            scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
-            scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
-
-            // Log device_rle_sweep_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy);
-
-            // Invoke device_rle_sweep_kernel
-            device_rle_sweep_kernel<<<scan_grid_size, device_rle_config.block_threads, 0, stream>>>(
-                d_in,
-                d_offsets_out,
-                d_lengths_out,
-                d_num_runs_out,
-                tile_status,
-                equality_op,
-                num_items,
-                num_tiles);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-        }
-        while (0);
-
-        return error;
-
-#endif  // CUB_RUNTIME_ENABLED
-    }
-
-
-    /**
-     * Internal dispatch routine
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                           ///< [in] Pointer to input sequence of data items
-        OffsetsOutputIteratorT      d_offsets_out,                  ///< [out] Pointer to output sequence of run-offsets
-        LengthsOutputIteratorT      d_lengths_out,                  ///< [out] Pointer to output sequence of run-lengths
-        NumRunsOutputIteratorT      d_num_runs_out,                 ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
-        EqualityOpT                 equality_op,                    ///< [in] Equality operator for input items
-        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream,                         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous)              ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel kernel dispatch configurations
-            KernelConfig device_rle_config;
-            InitConfigs(ptx_version, device_rle_config);
-
-            // Dispatch
-            if (CubDebug(error = Dispatch(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_in,
-                d_offsets_out,
-                d_lengths_out,
-                d_num_runs_out,
-                equality_op,
-                num_items,
-                stream,
-                debug_synchronous,
-                ptx_version,
-                DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
-                DeviceRleSweepKernel<PtxRleSweepPolicy, InputIteratorT, OffsetsOutputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, ScanTileStateT, EqualityOpT, OffsetT>,
-                device_rle_config))) break;
-        }
-        while (0);
-
-        return error;
-    }
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh
deleted file mode 100644
index 782e686d5..000000000
--- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh
+++ /dev/null
@@ -1,563 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "../../agent/agent_scan.cuh"
-#include "../../thread/thread_operators.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../util_arch.cuh"
-#include "../../util_debug.cuh"
-#include "../../util_device.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Kernel entry points
- *****************************************************************************/
-
-/**
- * Initialization kernel for tile status initialization (multi-block)
- */
-template <
-    typename            ScanTileStateT>     ///< Tile status interface type
-__global__ void DeviceScanInitKernel(
-    ScanTileStateT      tile_state,         ///< [in] Tile status interface
-    int                 num_tiles)          ///< [in] Number of tiles
-{
-    // Initialize tile status
-    tile_state.InitializeStatus(num_tiles);
-}
-
-/**
- * Initialization kernel for tile status initialization (multi-block)
- */
-template <
-    typename                ScanTileStateT,         ///< Tile status interface type
-    typename                NumSelectedIteratorT>   ///< Output iterator type for recording the number of items selected
-__global__ void DeviceCompactInitKernel(
-    ScanTileStateT          tile_state,             ///< [in] Tile status interface
-    int                     num_tiles,              ///< [in] Number of tiles
-    NumSelectedIteratorT    d_num_selected_out)     ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
-{
-    // Initialize tile status
-    tile_state.InitializeStatus(num_tiles);
-
-    // Initialize d_num_selected_out
-    if ((blockIdx.x == 0) && (threadIdx.x == 0))
-        *d_num_selected_out = 0;
-}
-
-
-/**
- * Scan kernel entry point (multi-block)
- */
-template <
-    typename            ScanPolicyT,        ///< Parameterized ScanPolicyT tuning policy type
-    typename            InputIteratorT,     ///< Random-access input iterator type for reading scan inputs \iterator
-    typename            OutputIteratorT,    ///< Random-access output iterator type for writing scan outputs \iterator
-    typename            ScanTileStateT,     ///< Tile status interface type
-    typename            ScanOpT,            ///< Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-    typename            InitValueT,         ///< Initial value to seed the exclusive scan (cub::NullType for inclusive scans)
-    typename            OffsetT>            ///< Signed integer type for global offsets
-__launch_bounds__ (int(ScanPolicyT::BLOCK_THREADS))
-__global__ void DeviceScanKernel(
-    InputIteratorT      d_in,               ///< Input data
-    OutputIteratorT     d_out,              ///< Output data
-    ScanTileStateT      tile_state,         ///< Tile status interface
-    int                 start_tile,         ///< The starting tile for the current grid
-    ScanOpT             scan_op,            ///< Binary scan functor 
-    InitValueT          init_value,         ///< Initial value to seed the exclusive scan
-    OffsetT             num_items)          ///< Total number of scan items for the entire problem
-{
-    // Thread block type for scanning input tiles
-    typedef AgentScan<
-        ScanPolicyT,
-        InputIteratorT,
-        OutputIteratorT,
-        ScanOpT,
-        InitValueT,
-        OffsetT> AgentScanT;
-
-    // Shared memory for AgentScan
-    __shared__ typename AgentScanT::TempStorage temp_storage;
-
-    // Process tiles
-    AgentScanT(temp_storage, d_in, d_out, scan_op, init_value).ConsumeRange(
-        num_items,
-        tile_state,
-        start_tile);
-}
-
-
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceScan
- */
-template <
-    typename InputIteratorT,     ///< Random-access input iterator type for reading scan inputs \iterator
-    typename OutputIteratorT,    ///< Random-access output iterator type for writing scan outputs \iterator
-    typename ScanOpT,            ///< Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-    typename InitValueT,          ///< The init_value element type for ScanOpT (cub::NullType for inclusive scans)
-    typename OffsetT>            ///< Signed integer type for global offsets
-struct DispatchScan
-{
-    //---------------------------------------------------------------------
-    // Constants and Types
-    //---------------------------------------------------------------------
-
-    enum
-    {
-        INIT_KERNEL_THREADS = 128
-    };
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    // Tile status descriptor interface type
-    typedef ScanTileState<OutputT> ScanTileStateT;
-
-
-    //---------------------------------------------------------------------
-    // Tuning policies
-    //---------------------------------------------------------------------
-
-    /// SM600
-    struct Policy600
-    {
-        typedef AgentScanPolicy<
-            CUB_SCALED_GRANULARITIES(128, 15, OutputT),      ///< Threads per block, items per thread
-                BLOCK_LOAD_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_STORE_TRANSPOSE,
-                BLOCK_SCAN_WARP_SCANS>
-            ScanPolicyT;
-    };
-
-
-    /// SM520
-    struct Policy520
-    {
-        // Titan X: 32.47B items/s @ 48M 32-bit T
-        typedef AgentScanPolicy<
-                CUB_SCALED_GRANULARITIES(128, 12, OutputT),      ///< Threads per block, items per thread
-                BLOCK_LOAD_DIRECT,
-                LOAD_LDG,
-                BLOCK_STORE_WARP_TRANSPOSE,
-                BLOCK_SCAN_WARP_SCANS>
-            ScanPolicyT;
-    };
-
-
-    /// SM35
-    struct Policy350
-    {
-        // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T
-        typedef AgentScanPolicy<
-                CUB_SCALED_GRANULARITIES(128, 12, OutputT),      ///< Threads per block, items per thread
-                BLOCK_LOAD_DIRECT,
-                LOAD_LDG,
-                BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
-                BLOCK_SCAN_RAKING>
-            ScanPolicyT;
-    };
-
-    /// SM30
-    struct Policy300
-    {
-        typedef AgentScanPolicy<
-                CUB_SCALED_GRANULARITIES(256, 9, OutputT),      ///< Threads per block, items per thread
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_STORE_WARP_TRANSPOSE,
-                BLOCK_SCAN_WARP_SCANS>
-            ScanPolicyT;
-    };
-
-    /// SM20
-    struct Policy200
-    {
-        // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T
-        typedef AgentScanPolicy<
-                CUB_SCALED_GRANULARITIES(128, 12, OutputT),      ///< Threads per block, items per thread
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_STORE_WARP_TRANSPOSE,
-                BLOCK_SCAN_WARP_SCANS>
-            ScanPolicyT;
-    };
-
-    /// SM13
-    struct Policy130
-    {
-        typedef AgentScanPolicy<
-                CUB_SCALED_GRANULARITIES(96, 21, OutputT),      ///< Threads per block, items per thread
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_STORE_WARP_TRANSPOSE,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            ScanPolicyT;
-    };
-
-    /// SM10
-    struct Policy100
-    {
-        typedef AgentScanPolicy<
-                CUB_SCALED_GRANULARITIES(64, 9, OutputT),      ///< Threads per block, items per thread
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_STORE_WARP_TRANSPOSE,
-                BLOCK_SCAN_WARP_SCANS>
-            ScanPolicyT;
-    };
-
-
-    //---------------------------------------------------------------------
-    // Tuning policies of current PTX compiler pass
-    //---------------------------------------------------------------------
-
-#if (CUB_PTX_ARCH >= 600)
-    typedef Policy600 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 520)
-    typedef Policy520 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 130)
-    typedef Policy130 PtxPolicy;
-
-#else
-    typedef Policy100 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxAgentScanPolicy : PtxPolicy::ScanPolicyT {};
-
-
-    //---------------------------------------------------------------------
-    // Utilities
-    //---------------------------------------------------------------------
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <typename KernelConfig>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static void InitConfigs(
-        int             ptx_version,
-        KernelConfig    &scan_kernel_config)
-    {
-    #if (CUB_PTX_ARCH > 0)
-        (void)ptx_version;
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        scan_kernel_config.template Init<PtxAgentScanPolicy>();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 600)
-        {
-            scan_kernel_config.template Init<typename Policy600::ScanPolicyT>();
-        }
-        else if (ptx_version >= 520)
-        {
-            scan_kernel_config.template Init<typename Policy520::ScanPolicyT>();
-        }
-        else if (ptx_version >= 350)
-        {
-            scan_kernel_config.template Init<typename Policy350::ScanPolicyT>();
-        }
-        else if (ptx_version >= 300)
-        {
-            scan_kernel_config.template Init<typename Policy300::ScanPolicyT>();
-        }
-        else if (ptx_version >= 200)
-        {
-            scan_kernel_config.template Init<typename Policy200::ScanPolicyT>();
-        }
-        else if (ptx_version >= 130)
-        {
-            scan_kernel_config.template Init<typename Policy130::ScanPolicyT>();
-        }
-        else
-        {
-            scan_kernel_config.template Init<typename Policy100::ScanPolicyT>();
-        }
-
-    #endif
-    }
-
-
-    /**
-     * Kernel kernel dispatch configuration.
-     */
-    struct KernelConfig
-    {
-        int block_threads;
-        int items_per_thread;
-        int tile_items;
-
-        template <typename PolicyT>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Init()
-        {
-            block_threads       = PolicyT::BLOCK_THREADS;
-            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
-            tile_items          = block_threads * items_per_thread;
-        }
-    };
-
-
-    //---------------------------------------------------------------------
-    // Dispatch entrypoints
-    //---------------------------------------------------------------------
-
-    /**
-     * Internal dispatch routine for computing a device-wide prefix scan using the
-     * specified kernel functions.
-     */
-    template <
-        typename            ScanInitKernelPtrT,     ///< Function type of cub::DeviceScanInitKernel
-        typename            ScanSweepKernelPtrT>    ///< Function type of cub::DeviceScanKernelPtrT
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*               d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&             temp_storage_bytes,     ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT      d_in,                   ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT     d_out,                  ///< [out] Pointer to the output sequence of data items
-        ScanOpT             scan_op,                ///< [in] Binary scan functor 
-        InitValueT          init_value,             ///< [in] Initial value to seed the exclusive scan
-        OffsetT             num_items,              ///< [in] Total number of input items (i.e., the length of \p d_in)
-        cudaStream_t        stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous,      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-        int                 /*ptx_version*/,        ///< [in] PTX version of dispatch kernels
-        ScanInitKernelPtrT  init_kernel,            ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
-        ScanSweepKernelPtrT scan_kernel,            ///< [in] Kernel function pointer to parameterization of cub::DeviceScanKernel
-        KernelConfig        scan_kernel_config)     ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for
-    {
-
-#ifndef CUB_RUNTIME_ENABLED
-        (void)d_temp_storage;
-        (void)temp_storage_bytes;
-        (void)d_in;
-        (void)d_out;
-        (void)scan_op;
-        (void)init_value;
-        (void)num_items;
-        (void)stream;
-        (void)debug_synchronous;
-        (void)init_kernel;
-        (void)scan_kernel;
-        (void)scan_kernel_config;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-#else
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Number of input tiles
-            int tile_size = scan_kernel_config.block_threads * scan_kernel_config.items_per_thread;
-            int num_tiles = (num_items + tile_size - 1) / tile_size;
-
-            // Specify temporary storage allocation requirements
-            size_t  allocation_sizes[1];
-            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
-
-            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
-            void* allocations[1];
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                break;
-            }
-
-            // Return if empty problem
-            if (num_items == 0)
-                break;
-
-            // Construct the tile status interface
-            ScanTileStateT tile_state;
-            if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
-
-            // Log init_kernel configuration
-            int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
-            if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
-
-            // Invoke init_kernel to initialize tile descriptors
-            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
-                tile_state,
-                num_tiles);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Get SM occupancy for scan_kernel
-            int scan_sm_occupancy;
-            if (CubDebug(error = MaxSmOccupancy(
-                scan_sm_occupancy,            // out
-                scan_kernel,
-                scan_kernel_config.block_threads))) break;
-
-            // Get max x-dimension of grid
-            int max_dim_x;
-            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
-
-            // Run grids in epochs (in case number of tiles exceeds max x-dimension
-            int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
-            for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
-            {
-                // Log scan_kernel configuration
-                if (debug_synchronous) _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                    start_tile, scan_grid_size, scan_kernel_config.block_threads, (long long) stream, scan_kernel_config.items_per_thread, scan_sm_occupancy);
-
-                // Invoke scan_kernel
-                scan_kernel<<<scan_grid_size, scan_kernel_config.block_threads, 0, stream>>>(
-                    d_in,
-                    d_out,
-                    tile_state,
-                    start_tile,
-                    scan_op,
-                    init_value,
-                    num_items);
-
-                // Check for failure to launch
-                if (CubDebug(error = cudaPeekAtLastError())) break;
-
-                // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-            }
-        }
-        while (0);
-
-        return error;
-
-#endif  // CUB_RUNTIME_ENABLED
-    }
-
-
-    /**
-     * Internal dispatch routine
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*           d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&         temp_storage_bytes,     ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT  d_in,                   ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT d_out,                  ///< [out] Pointer to the output sequence of data items
-        ScanOpT         scan_op,                ///< [in] Binary scan functor 
-        InitValueT      init_value,             ///< [in] Initial value to seed the exclusive scan
-        OffsetT         num_items,              ///< [in] Total number of input items (i.e., the length of \p d_in)
-        cudaStream_t    stream,                 ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool            debug_synchronous)      ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-
-            // Get kernel kernel dispatch configurations
-            KernelConfig scan_kernel_config;
-            InitConfigs(ptx_version, scan_kernel_config);
-
-            // Dispatch
-            if (CubDebug(error = Dispatch(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_in,
-                d_out,
-                scan_op,
-                init_value,
-                num_items,
-                stream,
-                debug_synchronous,
-                ptx_version,
-                DeviceScanInitKernel<ScanTileStateT>,
-                DeviceScanKernel<PtxAgentScanPolicy, InputIteratorT, OutputIteratorT, ScanTileStateT, ScanOpT, InitValueT, OffsetT>,
-                scan_kernel_config))) break;
-        }
-        while (0);
-
-        return error;
-    }
-};
-
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh
deleted file mode 100644
index 1b3aa8dad..000000000
--- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh
+++ /dev/null
@@ -1,542 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within device-accessible memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch_scan.cuh"
-#include "../../agent/agent_select_if.cuh"
-#include "../../thread/thread_operators.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../util_device.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/******************************************************************************
- * Kernel entry points
- *****************************************************************************/
-
-/**
- * Select kernel entry point (multi-block)
- *
- * Performs functor-based selection if SelectOpT functor type != NullType
- * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType
- * Otherwise performs discontinuity selection (keep unique)
- */
-template <
-    typename            AgentSelectIfPolicyT,       ///< Parameterized AgentSelectIfPolicyT tuning policy type
-    typename            InputIteratorT,             ///< Random-access input iterator type for reading input items
-    typename            FlagsInputIteratorT,        ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
-    typename            SelectedOutputIteratorT,    ///< Random-access output iterator type for writing selected items
-    typename            NumSelectedIteratorT,       ///< Output iterator type for recording the number of items selected
-    typename            ScanTileStateT,             ///< Tile status interface type
-    typename            SelectOpT,                  ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
-    typename            EqualityOpT,                ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
-    typename            OffsetT,                    ///< Signed integer type for global offsets
-    bool                KEEP_REJECTS>               ///< Whether or not we push rejected items to the back of the output
-__launch_bounds__ (int(AgentSelectIfPolicyT::BLOCK_THREADS))
-__global__ void DeviceSelectSweepKernel(
-    InputIteratorT          d_in,                   ///< [in] Pointer to the input sequence of data items
-    FlagsInputIteratorT     d_flags,                ///< [in] Pointer to the input sequence of selection flags (if applicable)
-    SelectedOutputIteratorT d_selected_out,         ///< [out] Pointer to the output sequence of selected data items
-    NumSelectedIteratorT    d_num_selected_out,     ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
-    ScanTileStateT          tile_status,            ///< [in] Tile status interface
-    SelectOpT               select_op,              ///< [in] Selection operator
-    EqualityOpT             equality_op,            ///< [in] Equality operator
-    OffsetT                 num_items,              ///< [in] Total number of input items (i.e., length of \p d_in)
-    int                     num_tiles)              ///< [in] Total number of tiles for the entire problem
-{
-    // Thread block type for selecting data from input tiles
-    typedef AgentSelectIf<
-        AgentSelectIfPolicyT,
-        InputIteratorT,
-        FlagsInputIteratorT,
-        SelectedOutputIteratorT,
-        SelectOpT,
-        EqualityOpT,
-        OffsetT,
-        KEEP_REJECTS> AgentSelectIfT;
-
-    // Shared memory for AgentSelectIf
-    __shared__ typename AgentSelectIfT::TempStorage temp_storage;
-
-    // Process tiles
-    AgentSelectIfT(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items).ConsumeRange(
-        num_tiles,
-        tile_status,
-        d_num_selected_out);
-}
-
-
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect
- */
-template <
-    typename    InputIteratorT,                 ///< Random-access input iterator type for reading input items
-    typename    FlagsInputIteratorT,            ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
-    typename    SelectedOutputIteratorT,        ///< Random-access output iterator type for writing selected items
-    typename    NumSelectedIteratorT,           ///< Output iterator type for recording the number of items selected
-    typename    SelectOpT,                      ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
-    typename    EqualityOpT,                    ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
-    typename    OffsetT,                        ///< Signed integer type for global offsets
-    bool        KEEP_REJECTS>                   ///< Whether or not we push rejected items to the back of the output
-struct DispatchSelectIf
-{
-    /******************************************************************************
-     * Types and constants
-     ******************************************************************************/
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<SelectedOutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                                  // ... then the input iterator's value type,
-        typename std::iterator_traits<SelectedOutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-    // The flag value type
-    typedef typename std::iterator_traits<FlagsInputIteratorT>::value_type FlagT;
-
-    enum
-    {
-        INIT_KERNEL_THREADS = 128,
-    };
-
-    // Tile status descriptor interface type
-    typedef ScanTileState<OffsetT> ScanTileStateT;
-
-
-    /******************************************************************************
-     * Tuning policies
-     ******************************************************************************/
-
-    /// SM35
-    struct Policy350
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 10,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
-        };
-
-        typedef AgentSelectIfPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_DIRECT,
-                LOAD_LDG,
-                BLOCK_SCAN_WARP_SCANS>
-            SelectIfPolicyT;
-    };
-
-    /// SM30
-    struct Policy300
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 7,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
-        };
-
-        typedef AgentSelectIfPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS>
-            SelectIfPolicyT;
-    };
-
-    /// SM20
-    struct Policy200
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 7 : 15,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
-        };
-
-        typedef AgentSelectIfPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS>
-            SelectIfPolicyT;
-    };
-
-    /// SM13
-    struct Policy130
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 9,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
-        };
-
-        typedef AgentSelectIfPolicy<
-                64,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            SelectIfPolicyT;
-    };
-
-    /// SM10
-    struct Policy100
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 9,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))),
-        };
-
-        typedef AgentSelectIfPolicy<
-                64,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_RAKING>
-            SelectIfPolicyT;
-    };
-
-
-    /******************************************************************************
-     * Tuning policies of current PTX compiler pass
-     ******************************************************************************/
-
-#if (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 130)
-    typedef Policy130 PtxPolicy;
-
-#else
-    typedef Policy100 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxSelectIfPolicyT : PtxPolicy::SelectIfPolicyT {};
-
-
-    /******************************************************************************
-     * Utilities
-     ******************************************************************************/
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <typename KernelConfig>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static void InitConfigs(
-        int             ptx_version,
-        KernelConfig    &select_if_config)
-    {
-    #if (CUB_PTX_ARCH > 0)
-        (void)ptx_version;
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        select_if_config.template Init<PtxSelectIfPolicyT>();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 350)
-        {
-            select_if_config.template Init<typename Policy350::SelectIfPolicyT>();
-        }
-        else if (ptx_version >= 300)
-        {
-            select_if_config.template Init<typename Policy300::SelectIfPolicyT>();
-        }
-        else if (ptx_version >= 200)
-        {
-            select_if_config.template Init<typename Policy200::SelectIfPolicyT>();
-        }
-        else if (ptx_version >= 130)
-        {
-            select_if_config.template Init<typename Policy130::SelectIfPolicyT>();
-        }
-        else
-        {
-            select_if_config.template Init<typename Policy100::SelectIfPolicyT>();
-        }
-
-    #endif
-    }
-
-
-    /**
-     * Kernel kernel dispatch configuration.
-     */
-    struct KernelConfig
-    {
-        int block_threads;
-        int items_per_thread;
-        int tile_items;
-
-        template <typename PolicyT>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Init()
-        {
-            block_threads       = PolicyT::BLOCK_THREADS;
-            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
-            tile_items          = block_threads * items_per_thread;
-        }
-    };
-
-
-    /******************************************************************************
-     * Dispatch entrypoints
-     ******************************************************************************/
-
-    /**
-     * Internal dispatch routine for computing a device-wide selection using the
-     * specified kernel functions.
-     */
-    template <
-        typename                    ScanInitKernelPtrT,             ///< Function type of cub::DeviceScanInitKernel
-        typename                    SelectIfKernelPtrT>             ///< Function type of cub::SelectIfKernelPtrT
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
-        FlagsInputIteratorT         d_flags,                        ///< [in] Pointer to the input sequence of selection flags (if applicable)
-        SelectedOutputIteratorT     d_selected_out,                 ///< [in] Pointer to the output sequence of selected data items
-        NumSelectedIteratorT        d_num_selected_out,             ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
-        SelectOpT                   select_op,                      ///< [in] Selection operator
-        EqualityOpT                 equality_op,                    ///< [in] Equality operator
-        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous,              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-        int                         /*ptx_version*/,                ///< [in] PTX version of dispatch kernels
-        ScanInitKernelPtrT          scan_init_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
-        SelectIfKernelPtrT          select_if_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel
-        KernelConfig                select_if_config)               ///< [in] Dispatch parameters that match the policy that \p select_if_kernel was compiled for
-    {
-
-#ifndef CUB_RUNTIME_ENABLED
-        (void)d_temp_storage;
-        (void)temp_storage_bytes;
-        (void)d_in;
-        (void)d_flags;
-        (void)d_selected_out;
-        (void)d_num_selected_out;
-        (void)select_op;
-        (void)equality_op;
-        (void)num_items;
-        (void)stream;
-        (void)debug_synchronous;
-        (void)scan_init_kernel;
-        (void)select_if_kernel;
-        (void)select_if_config;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-#else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Number of input tiles
-            int tile_size = select_if_config.block_threads * select_if_config.items_per_thread;
-            int num_tiles = (num_items + tile_size - 1) / tile_size;
-
-            // Specify temporary storage allocation requirements
-            size_t  allocation_sizes[1];
-            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
-
-            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
-            void* allocations[1];
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                break;
-            }
-
-            // Construct the tile status interface
-            ScanTileStateT tile_status;
-            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
-
-            // Log scan_init_kernel configuration
-            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
-            if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
-
-            // Invoke scan_init_kernel to initialize tile descriptors
-            scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
-                tile_status,
-                num_tiles,
-                d_num_selected_out);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Return if empty problem
-            if (num_items == 0)
-                break;
-
-            // Get SM occupancy for select_if_kernel
-            int range_select_sm_occupancy;
-            if (CubDebug(error = MaxSmOccupancy(
-                range_select_sm_occupancy,            // out
-                select_if_kernel,
-                select_if_config.block_threads))) break;
-
-            // Get max x-dimension of grid
-            int max_dim_x;
-            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
-
-            // Get grid size for scanning tiles
-            dim3 scan_grid_size;
-            scan_grid_size.z = 1;
-            scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
-            scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
-
-            // Log select_if_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking select_if_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy);
-
-            // Invoke select_if_kernel
-            select_if_kernel<<<scan_grid_size, select_if_config.block_threads, 0, stream>>>(
-                d_in,
-                d_flags,
-                d_selected_out,
-                d_num_selected_out,
-                tile_status,
-                select_op,
-                equality_op,
-                num_items,
-                num_tiles);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-        }
-        while (0);
-
-        return error;
-
-#endif  // CUB_RUNTIME_ENABLED
-    }
-
-
-    /**
-     * Internal dispatch routine
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*                       d_temp_storage,                 ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&                     temp_storage_bytes,             ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT              d_in,                           ///< [in] Pointer to the input sequence of data items
-        FlagsInputIteratorT         d_flags,                        ///< [in] Pointer to the input sequence of selection flags (if applicable)
-        SelectedOutputIteratorT     d_selected_out,                 ///< [in] Pointer to the output sequence of selected data items
-        NumSelectedIteratorT        d_num_selected_out,             ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
-        SelectOpT                   select_op,                      ///< [in] Selection operator
-        EqualityOpT                 equality_op,                    ///< [in] Equality operator
-        OffsetT                     num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream,                         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous)              ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel kernel dispatch configurations
-            KernelConfig select_if_config;
-            InitConfigs(ptx_version, select_if_config);
-
-            // Dispatch
-            if (CubDebug(error = Dispatch(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_in,
-                d_flags,
-                d_selected_out,
-                d_num_selected_out,
-                select_op,
-                equality_op,
-                num_items,
-                stream,
-                debug_synchronous,
-                ptx_version,
-                DeviceCompactInitKernel<ScanTileStateT, NumSelectedIteratorT>,
-                DeviceSelectSweepKernel<PtxSelectIfPolicyT, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, NumSelectedIteratorT, ScanTileStateT, SelectOpT, EqualityOpT, OffsetT, KEEP_REJECTS>,
-                select_if_config))) break;
-        }
-        while (0);
-
-        return error;
-    }
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh b/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh
deleted file mode 100644
index a0bf515c1..000000000
--- a/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh
+++ /dev/null
@@ -1,834 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV).
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "../../agent/single_pass_scan_operators.cuh"
-#include "../../agent/agent_segment_fixup.cuh"
-#include "../../agent/agent_spmv_orig.cuh"
-#include "../../util_type.cuh"
-#include "../../util_debug.cuh"
-#include "../../util_device.cuh"
-#include "../../thread/thread_search.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * SpMV kernel entry points
- *****************************************************************************/
-
-/**
- * Spmv search kernel. Identifies merge path starting coordinates for each tile.
- */
-template <
-    typename    AgentSpmvPolicyT,           ///< Parameterized SpmvPolicy tuning policy type
-    typename    ValueT,                     ///< Matrix and vector value type
-    typename    OffsetT>                    ///< Signed integer type for sequence offsets
-__global__ void DeviceSpmv1ColKernel(
-    SpmvParams<ValueT, OffsetT> spmv_params)                ///< [in] SpMV input parameter bundle
-{
-    typedef CacheModifiedInputIterator<
-            AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER,
-            ValueT,
-            OffsetT>
-        VectorValueIteratorT;
-
-    VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x);
-
-    int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-    if (row_idx < spmv_params.num_rows)
-    {
-        OffsetT     end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx];
-        OffsetT     nonzero_idx = spmv_params.d_row_end_offsets[row_idx - 1];
-
-        ValueT value = 0.0;
-        if (end_nonzero_idx != nonzero_idx)
-        {
-            value = spmv_params.d_values[nonzero_idx] * wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]];
-        }
-
-        spmv_params.d_vector_y[row_idx] = value;
-    }
-}
-
-
-/**
- * Spmv search kernel. Identifies merge path starting coordinates for each tile.
- */
-template <
-    typename    SpmvPolicyT,                    ///< Parameterized SpmvPolicy tuning policy type
-    typename    OffsetT,                        ///< Signed integer type for sequence offsets
-    typename    CoordinateT,                    ///< Merge path coordinate type
-    typename    SpmvParamsT>                    ///< SpmvParams type
-__global__ void DeviceSpmvSearchKernel(
-    int             num_merge_tiles,            ///< [in] Number of SpMV merge tiles (spmv grid size)
-    CoordinateT*    d_tile_coordinates,         ///< [out] Pointer to the temporary array of tile starting coordinates
-    SpmvParamsT     spmv_params)                ///< [in] SpMV input parameter bundle
-{
-    /// Constants
-    enum
-    {
-        BLOCK_THREADS           = SpmvPolicyT::BLOCK_THREADS,
-        ITEMS_PER_THREAD        = SpmvPolicyT::ITEMS_PER_THREAD,
-        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
-    };
-
-    typedef CacheModifiedInputIterator<
-            SpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
-            OffsetT,
-            OffsetT>
-        RowOffsetsSearchIteratorT;
-
-    // Find the starting coordinate for all tiles (plus the end coordinate of the last one)
-    int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-    if (tile_idx < num_merge_tiles + 1)
-    {
-        OffsetT                         diagonal = (tile_idx * TILE_ITEMS);
-        CoordinateT                     tile_coordinate;
-        CountingInputIterator<OffsetT>  nonzero_indices(0);
-
-        // Search the merge path
-        MergePathSearch(
-            diagonal,
-            RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets),
-            nonzero_indices,
-            spmv_params.num_rows,
-            spmv_params.num_nonzeros,
-            tile_coordinate);
-
-        // Output starting offset
-        d_tile_coordinates[tile_idx] = tile_coordinate;
-    }
-}
-
-
-/**
- * Spmv agent entry point
- */
-template <
-    typename        SpmvPolicyT,                ///< Parameterized SpmvPolicy tuning policy type
-    typename        ScanTileStateT,             ///< Tile status interface type
-    typename        ValueT,                     ///< Matrix and vector value type
-    typename        OffsetT,                    ///< Signed integer type for sequence offsets
-    typename        CoordinateT,                ///< Merge path coordinate type
-    bool            HAS_ALPHA,                  ///< Whether the input parameter Alpha is 1
-    bool            HAS_BETA>                   ///< Whether the input parameter Beta is 0
-__launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS))
-__global__ void DeviceSpmvKernel(
-    SpmvParams<ValueT, OffsetT>     spmv_params,                ///< [in] SpMV input parameter bundle
-    CoordinateT*                    d_tile_coordinates,         ///< [in] Pointer to the temporary array of tile starting coordinates
-    KeyValuePair<OffsetT,ValueT>*   d_tile_carry_pairs,         ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block
-    int                             num_tiles,                  ///< [in] Number of merge tiles
-    ScanTileStateT                  tile_state,                 ///< [in] Tile status interface for fixup reduce-by-key kernel
-    int                             num_segment_fixup_tiles)    ///< [in] Number of reduce-by-key tiles (fixup grid size)
-{
-    // Spmv agent type specialization
-    typedef AgentSpmv<
-            SpmvPolicyT,
-            ValueT,
-            OffsetT,
-            HAS_ALPHA,
-            HAS_BETA>
-        AgentSpmvT;
-
-    // Shared memory for AgentSpmv
-    __shared__ typename AgentSpmvT::TempStorage temp_storage;
-
-    AgentSpmvT(temp_storage, spmv_params).ConsumeTile(
-        d_tile_coordinates,
-        d_tile_carry_pairs,
-        num_tiles);
-
-    // Initialize fixup tile status
-    tile_state.InitializeStatus(num_segment_fixup_tiles);
-
-}
-
-
-/**
- * Multi-block reduce-by-key sweep kernel entry point
- */
-template <
-    typename    AgentSegmentFixupPolicyT,       ///< Parameterized AgentSegmentFixupPolicy tuning policy type
-    typename    PairsInputIteratorT,            ///< Random-access input iterator type for keys
-    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
-    typename    OffsetT,                        ///< Signed integer type for global offsets
-    typename    ScanTileStateT>                 ///< Tile status interface type
-__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS))
-__global__ void DeviceSegmentFixupKernel(
-    PairsInputIteratorT         d_pairs_in,         ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block
-    AggregatesOutputIteratorT   d_aggregates_out,   ///< [in,out] Output value aggregates
-    OffsetT                     num_items,          ///< [in] Total number of items to select from
-    int                         num_tiles,          ///< [in] Total number of tiles for the entire problem
-    ScanTileStateT              tile_state)         ///< [in] Tile status interface
-{
-    // Thread block type for reducing tiles of value segments
-    typedef AgentSegmentFixup<
-            AgentSegmentFixupPolicyT,
-            PairsInputIteratorT,
-            AggregatesOutputIteratorT,
-            cub::Equality,
-            cub::Sum,
-            OffsetT>
-        AgentSegmentFixupT;
-
-    // Shared memory for AgentSegmentFixup
-    __shared__ typename AgentSegmentFixupT::TempStorage temp_storage;
-
-    // Process tiles
-    AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, cub::Equality(), cub::Sum()).ConsumeRange(
-        num_items,
-        num_tiles,
-        tile_state);
-}
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv
- */
-template <
-    typename    ValueT,                     ///< Matrix and vector value type
-    typename    OffsetT>                    ///< Signed integer type for global offsets
-struct DispatchSpmv
-{
-    //---------------------------------------------------------------------
-    // Constants and Types
-    //---------------------------------------------------------------------
-
-    enum
-    {
-        INIT_KERNEL_THREADS = 128
-    };
-
-    // SpmvParams bundle type
-    typedef SpmvParams<ValueT, OffsetT> SpmvParamsT;
-
-    // 2D merge path coordinate type
-    typedef typename CubVector<OffsetT, 2>::Type CoordinateT;
-
-    // Tile status descriptor interface type
-    typedef ReduceByKeyScanTileState<ValueT, OffsetT> ScanTileStateT;
-
-    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
-    typedef KeyValuePair<OffsetT, ValueT> KeyValuePairT;
-
-
-    //---------------------------------------------------------------------
-    // Tuning policies
-    //---------------------------------------------------------------------
-
-    /// SM11
-    struct Policy110
-    {
-        typedef AgentSpmvPolicy<
-                128,
-                1,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                false,
-                BLOCK_SCAN_WARP_SCANS>
-            SpmvPolicyT;
-
-        typedef AgentSegmentFixupPolicy<
-                128,
-                4,
-                BLOCK_LOAD_VECTORIZE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS>
-            SegmentFixupPolicyT;
-    };
-
-    /// SM20
-    struct Policy200 
-    {
-        typedef AgentSpmvPolicy<
-                96,
-                18,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                false,
-                BLOCK_SCAN_RAKING>
-            SpmvPolicyT;
-
-        typedef AgentSegmentFixupPolicy<
-                128,
-                4,
-                BLOCK_LOAD_VECTORIZE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS>
-            SegmentFixupPolicyT;
-
-    };
-
-
-
-    /// SM30
-    struct Policy300 
-    {
-        typedef AgentSpmvPolicy<
-                96,
-                6,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                false,
-                BLOCK_SCAN_WARP_SCANS>
-            SpmvPolicyT;
-
-        typedef AgentSegmentFixupPolicy<
-                128,
-                4,
-                BLOCK_LOAD_VECTORIZE,
-                LOAD_DEFAULT,
-                BLOCK_SCAN_WARP_SCANS>
-            SegmentFixupPolicyT;
-
-    };
-
-
-    /// SM35
-    struct Policy350
-    {
-        typedef AgentSpmvPolicy<
-                (sizeof(ValueT) > 4) ? 96 : 128,
-                (sizeof(ValueT) > 4) ? 4 : 7,
-                LOAD_LDG,
-                LOAD_CA,
-                LOAD_LDG,
-                LOAD_LDG,
-                LOAD_LDG,
-                (sizeof(ValueT) > 4) ? true : false,
-                BLOCK_SCAN_WARP_SCANS>
-            SpmvPolicyT;
-
-        typedef AgentSegmentFixupPolicy<
-                128,
-                3,
-                BLOCK_LOAD_VECTORIZE,
-                LOAD_LDG,
-                BLOCK_SCAN_WARP_SCANS>
-            SegmentFixupPolicyT;
-    };
-
-
-    /// SM37
-    struct Policy370
-    {
-
-        typedef AgentSpmvPolicy<
-                (sizeof(ValueT) > 4) ? 128 : 128,
-                (sizeof(ValueT) > 4) ? 9 : 14,
-                LOAD_LDG,
-                LOAD_CA,
-                LOAD_LDG,
-                LOAD_LDG,
-                LOAD_LDG,
-                false, 
-                BLOCK_SCAN_WARP_SCANS>
-            SpmvPolicyT;
-
-        typedef AgentSegmentFixupPolicy<
-                128,
-                3,
-                BLOCK_LOAD_VECTORIZE,
-                LOAD_LDG,
-                BLOCK_SCAN_WARP_SCANS>
-            SegmentFixupPolicyT;
-    };
-
-    /// SM50
-    struct Policy500
-    {
-        typedef AgentSpmvPolicy<
-                (sizeof(ValueT) > 4) ? 64 : 128,
-                (sizeof(ValueT) > 4) ? 6 : 7,
-                LOAD_LDG,
-                LOAD_DEFAULT,
-                (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT,
-                (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT,
-                LOAD_LDG,
-                (sizeof(ValueT) > 4) ? true : false,
-                (sizeof(ValueT) > 4) ? BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE>
-            SpmvPolicyT;
-
-
-        typedef AgentSegmentFixupPolicy<
-                128,
-                3,
-                BLOCK_LOAD_VECTORIZE,
-                LOAD_LDG,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            SegmentFixupPolicyT;
-    };
-
-
-    /// SM60
-    struct Policy600
-    {
-        typedef AgentSpmvPolicy<
-                (sizeof(ValueT) > 4) ? 64 : 128,
-                (sizeof(ValueT) > 4) ? 5 : 7,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                LOAD_DEFAULT,
-                false,
-                BLOCK_SCAN_WARP_SCANS>
-            SpmvPolicyT;
-
-
-        typedef AgentSegmentFixupPolicy<
-                128,
-                3,
-                BLOCK_LOAD_DIRECT,
-                LOAD_LDG,
-                BLOCK_SCAN_WARP_SCANS>
-            SegmentFixupPolicyT;
-    };
-
-
-
-    //---------------------------------------------------------------------
-    // Tuning policies of current PTX compiler pass
-    //---------------------------------------------------------------------
-
-#if (CUB_PTX_ARCH >= 600)
-    typedef Policy600 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 500)
-    typedef Policy500 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 370)
-    typedef Policy370 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#else
-    typedef Policy110 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT {};
-    struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT {};
-
-
-    //---------------------------------------------------------------------
-    // Utilities
-    //---------------------------------------------------------------------
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <typename KernelConfig>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static void InitConfigs(
-        int             ptx_version,
-        KernelConfig    &spmv_config,
-        KernelConfig    &segment_fixup_config)
-    {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        spmv_config.template Init<PtxSpmvPolicyT>();
-        segment_fixup_config.template Init<PtxSegmentFixupPolicy>();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 600)
-        {
-            spmv_config.template            Init<typename Policy600::SpmvPolicyT>();
-            segment_fixup_config.template   Init<typename Policy600::SegmentFixupPolicyT>();
-        }
-        else if (ptx_version >= 500)
-        {
-            spmv_config.template            Init<typename Policy500::SpmvPolicyT>();
-            segment_fixup_config.template   Init<typename Policy500::SegmentFixupPolicyT>();
-        }
-        else if (ptx_version >= 370)
-        {
-            spmv_config.template            Init<typename Policy370::SpmvPolicyT>();
-            segment_fixup_config.template   Init<typename Policy370::SegmentFixupPolicyT>();
-        }
-        else if (ptx_version >= 350)
-        {
-            spmv_config.template            Init<typename Policy350::SpmvPolicyT>();
-            segment_fixup_config.template   Init<typename Policy350::SegmentFixupPolicyT>();
-        }
-        else if (ptx_version >= 300)
-        {
-            spmv_config.template            Init<typename Policy300::SpmvPolicyT>();
-            segment_fixup_config.template   Init<typename Policy300::SegmentFixupPolicyT>();
-
-        }
-        else if (ptx_version >= 200)
-        {
-            spmv_config.template            Init<typename Policy200::SpmvPolicyT>();
-            segment_fixup_config.template   Init<typename Policy200::SegmentFixupPolicyT>();
-        }
-        else
-        {
-            spmv_config.template            Init<typename Policy110::SpmvPolicyT>();
-            segment_fixup_config.template   Init<typename Policy110::SegmentFixupPolicyT>();
-        }
-
-    #endif
-    }
-
-
-    /**
-     * Kernel kernel dispatch configuration.
-     */
-    struct KernelConfig
-    {
-        int block_threads;
-        int items_per_thread;
-        int tile_items;
-
-        template <typename PolicyT>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Init()
-        {
-            block_threads       = PolicyT::BLOCK_THREADS;
-            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
-            tile_items          = block_threads * items_per_thread;
-        }
-    };
-
-
-    //---------------------------------------------------------------------
-    // Dispatch entrypoints
-    //---------------------------------------------------------------------
-
-    /**
-     * Internal dispatch routine for computing a device-wide reduction using the
-     * specified kernel functions.
-     *
-     * If the input is larger than a single tile, this method uses two-passes of
-     * kernel invocations.
-     */
-    template <
-        typename                Spmv1ColKernelT,                    ///< Function type of cub::DeviceSpmv1ColKernel
-        typename                SpmvSearchKernelT,                  ///< Function type of cub::AgentSpmvSearchKernel
-        typename                SpmvKernelT,                        ///< Function type of cub::AgentSpmvKernel
-        typename                SegmentFixupKernelT>                 ///< Function type of cub::DeviceSegmentFixupKernelT
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*                   d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&                 temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SpmvParamsT&            spmv_params,                        ///< SpMV input parameter bundle
-        cudaStream_t            stream,                             ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous,                  ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-        Spmv1ColKernelT         spmv_1col_kernel,                   ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel
-        SpmvSearchKernelT       spmv_search_kernel,                 ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel
-        SpmvKernelT             spmv_kernel,                        ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel
-        SegmentFixupKernelT     segment_fixup_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel
-        KernelConfig            spmv_config,                        ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for
-        KernelConfig            segment_fixup_config)               ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for
-    {
-#ifndef CUB_RUNTIME_ENABLED
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-
-#else
-        cudaError error = cudaSuccess;
-        do
-        {
-            if (spmv_params.num_cols == 1)
-            {
-                if (d_temp_storage == NULL)
-                {
-                    // Return if the caller is simply requesting the size of the storage allocation
-                    temp_storage_bytes = 1;
-                    break;
-                }
-
-                // Get search/init grid dims
-                int degen_col_kernel_block_size     = INIT_KERNEL_THREADS;
-                int degen_col_kernel_grid_size      = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size;
-
-                if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n",
-                    degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream);
-
-                // Invoke spmv_search_kernel
-                spmv_1col_kernel<<<degen_col_kernel_grid_size, degen_col_kernel_block_size, 0, stream>>>(
-                    spmv_params);
-
-                // Check for failure to launch
-                if (CubDebug(error = cudaPeekAtLastError())) break;
-
-                // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-                break;
-            }
-
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Get max x-dimension of grid
-            int max_dim_x;
-            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;;
-
-            // Total number of spmv work items
-            int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros;
-
-            // Tile sizes of kernels
-            int merge_tile_size              = spmv_config.block_threads * spmv_config.items_per_thread;
-            int segment_fixup_tile_size     = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread;
-
-            // Number of tiles for kernels
-            unsigned int num_merge_tiles            = (num_merge_items + merge_tile_size - 1) / merge_tile_size;
-            unsigned int num_segment_fixup_tiles    = (num_merge_tiles + segment_fixup_tile_size - 1) / segment_fixup_tile_size;
-
-            // Get SM occupancy for kernels
-            int spmv_sm_occupancy;
-            if (CubDebug(error = MaxSmOccupancy(
-                spmv_sm_occupancy,
-                spmv_kernel,
-                spmv_config.block_threads))) break;
-
-            int segment_fixup_sm_occupancy;
-            if (CubDebug(error = MaxSmOccupancy(
-                segment_fixup_sm_occupancy,
-                segment_fixup_kernel,
-                segment_fixup_config.block_threads))) break;
-
-            // Get grid dimensions
-            dim3 spmv_grid_size(
-                CUB_MIN(num_merge_tiles, max_dim_x),
-                (num_merge_tiles + max_dim_x - 1) / max_dim_x,
-                1);
-
-            dim3 segment_fixup_grid_size(
-                CUB_MIN(num_segment_fixup_tiles, max_dim_x),
-                (num_segment_fixup_tiles + max_dim_x - 1) / max_dim_x,
-                1);
-
-            // Get the temporary storage allocation requirements
-            size_t allocation_sizes[3];
-            if (CubDebug(error = ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0]))) break;    // bytes needed for reduce-by-key tile status descriptors
-            allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT);       // bytes needed for block carry-out pairs
-            allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT);   // bytes needed for tile starting coordinates
-
-            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
-            void* allocations[3];
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                break;
-            }
-
-            // Construct the tile status interface
-            ScanTileStateT tile_state;
-            if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0]))) break;
-
-            // Alias the other allocations
-            KeyValuePairT*  d_tile_carry_pairs      = (KeyValuePairT*) allocations[1];  // Agent carry-out pairs
-            CoordinateT*    d_tile_coordinates      = (CoordinateT*) allocations[2];    // Agent starting coordinates
-
-            // Get search/init grid dims
-            int search_block_size   = INIT_KERNEL_THREADS;
-            int search_grid_size    = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size;
-
-#if (CUB_PTX_ARCH == 0)
-            // Init textures
-            if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break;
-#endif
-
-            if (search_grid_size < sm_count)
-//            if (num_merge_tiles < spmv_sm_occupancy * sm_count)
-            {
-                // Not enough spmv tiles to saturate the device: have spmv blocks search their own staring coords
-                d_tile_coordinates = NULL;
-            }
-            else
-            {
-                // Use separate search kernel if we have enough spmv tiles to saturate the device
-
-                // Log spmv_search_kernel configuration
-                if (debug_synchronous) _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n",
-                    search_grid_size, search_block_size, (long long) stream);
-
-                // Invoke spmv_search_kernel
-                spmv_search_kernel<<<search_grid_size, search_block_size, 0, stream>>>(
-                    num_merge_tiles,
-                    d_tile_coordinates,
-                    spmv_params);
-
-                // Check for failure to launch
-                if (CubDebug(error = cudaPeekAtLastError())) break;
-
-                // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-            }
-
-            // Log spmv_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy);
-
-            // Invoke spmv_kernel
-            spmv_kernel<<<spmv_grid_size, spmv_config.block_threads, 0, stream>>>(
-                spmv_params,
-                d_tile_coordinates,
-                d_tile_carry_pairs,
-                num_merge_tiles,
-                tile_state,
-                num_segment_fixup_tiles);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Run reduce-by-key fixup if necessary
-            if (num_merge_tiles > 1)
-            {
-                // Log segment_fixup_kernel configuration
-                if (debug_synchronous) _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                    segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy);
-
-                // Invoke segment_fixup_kernel
-                segment_fixup_kernel<<<segment_fixup_grid_size, segment_fixup_config.block_threads, 0, stream>>>(
-                    d_tile_carry_pairs,
-                    spmv_params.d_vector_y,
-                    num_merge_tiles,
-                    num_segment_fixup_tiles,
-                    tile_state);
-
-                // Check for failure to launch
-                if (CubDebug(error = cudaPeekAtLastError())) break;
-
-                // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-            }
-
-#if (CUB_PTX_ARCH == 0)
-            // Free textures
-            if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break;
-#endif
-        }
-        while (0);
-
-        return error;
-
-#endif // CUB_RUNTIME_ENABLED
-    }
-
-
-    /**
-     * Internal dispatch routine for computing a device-wide reduction
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void*                   d_temp_storage,                     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&                 temp_storage_bytes,                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SpmvParamsT&            spmv_params,                        ///< SpMV input parameter bundle
-        cudaStream_t            stream                  = 0,        ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous       = false)    ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel kernel dispatch configurations
-            KernelConfig spmv_config, segment_fixup_config;
-            InitConfigs(ptx_version, spmv_config, segment_fixup_config);
-
-            if (CubDebug(error = Dispatch(
-                d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
-                DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
-                DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
-                DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, false, false>,
-                DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
-                spmv_config, segment_fixup_config))) break;
-
-        }
-        while (0);
-
-        return error;
-    }
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh b/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh
deleted file mode 100644
index 5b12c66ed..000000000
--- a/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh
+++ /dev/null
@@ -1,211 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid
- */
-
-#pragma once
-
-#include "../util_debug.cuh"
-#include "../util_namespace.cuh"
-#include "../thread/thread_load.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup GridModule
- * @{
- */
-
-
-/**
- * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid
- */
-class GridBarrier
-{
-protected :
-
-    typedef unsigned int SyncFlag;
-
-    // Counters in global device memory
-    SyncFlag* d_sync;
-
-public:
-
-    /**
-     * Constructor
-     */
-    GridBarrier() : d_sync(NULL) {}
-
-
-    /**
-     * Synchronize
-     */
-    __device__ __forceinline__ void Sync() const
-    {
-        volatile SyncFlag *d_vol_sync = d_sync;
-
-        // Threadfence and syncthreads to make sure global writes are visible before
-        // thread-0 reports in with its sync counter
-        __threadfence();
-        CTA_SYNC();
-
-        if (blockIdx.x == 0)
-        {
-            // Report in ourselves
-            if (threadIdx.x == 0)
-            {
-                d_vol_sync[blockIdx.x] = 1;
-            }
-
-            CTA_SYNC();
-
-            // Wait for everyone else to report in
-            for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
-            {
-                while (ThreadLoad<LOAD_CG>(d_sync + peer_block) == 0)
-                {
-                    __threadfence_block();
-                }
-            }
-
-            CTA_SYNC();
-
-            // Let everyone know it's safe to proceed
-            for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
-            {
-                d_vol_sync[peer_block] = 0;
-            }
-        }
-        else
-        {
-            if (threadIdx.x == 0)
-            {
-                // Report in
-                d_vol_sync[blockIdx.x] = 1;
-
-                // Wait for acknowledgment
-                while (ThreadLoad<LOAD_CG>(d_sync + blockIdx.x) == 1)
-                {
-                    __threadfence_block();
-                }
-            }
-
-            CTA_SYNC();
-        }
-    }
-};
-
-
-/**
- * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation.
- *
- * Uses RAII for lifetime, i.e., device resources are reclaimed when
- * the destructor is called.
- */
-class GridBarrierLifetime : public GridBarrier
-{
-protected:
-
-    // Number of bytes backed by d_sync
-    size_t sync_bytes;
-
-public:
-
-    /**
-     * Constructor
-     */
-    GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {}
-
-
-    /**
-     * DeviceFrees and resets the progress counters
-     */
-    cudaError_t HostReset()
-    {
-        cudaError_t retval = cudaSuccess;
-        if (d_sync)
-        {
-            CubDebug(retval = cudaFree(d_sync));
-            d_sync = NULL;
-        }
-        sync_bytes = 0;
-        return retval;
-    }
-
-
-    /**
-     * Destructor
-     */
-    virtual ~GridBarrierLifetime()
-    {
-        HostReset();
-    }
-
-
-    /**
-     * Sets up the progress counters for the next kernel launch (lazily
-     * allocating and initializing them if necessary)
-     */
-    cudaError_t Setup(int sweep_grid_size)
-    {
-        cudaError_t retval = cudaSuccess;
-        do {
-            size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag);
-            if (new_sync_bytes > sync_bytes)
-            {
-                if (d_sync)
-                {
-                    if (CubDebug(retval = cudaFree(d_sync))) break;
-                }
-
-                sync_bytes = new_sync_bytes;
-
-                // Allocate and initialize to zero
-                if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break;
-                if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break;
-            }
-        } while (0);
-
-        return retval;
-    }
-};
-
-
-/** @} */       // end group GridModule
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh b/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh
deleted file mode 100644
index 59fe5c909..000000000
--- a/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh
+++ /dev/null
@@ -1,222 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-share" fashion.  Each thread block gets roughly the same number of fixed-size work units (grains).
- */
-
-
-#pragma once
-
-#include "../util_namespace.cuh"
-#include "../util_macro.cuh"
-#include "grid_mapping.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup GridModule
- * @{
- */
-
-
-/**
- * \brief GridEvenShare is a descriptor utility for distributing input among
- * CUDA thread blocks in an "even-share" fashion.  Each thread block gets roughly
- * the same number of input tiles.
- *
- * \par Overview
- * Each thread block is assigned a consecutive sequence of input tiles.  To help
- * preserve alignment and eliminate the overhead of guarded loads for all but the
- * last thread block, to GridEvenShare assigns one of three different amounts of
- * work to a given thread block: "big", "normal", or "last".  The "big" workloads
- * are one scheduling grain larger than "normal".  The "last" work unit for the
- * last thread block may be partially-full if the input is not an even multiple of
- * the scheduling grain size.
- *
- * \par
- * Before invoking a child grid, a parent thread will typically construct an
- * instance of GridEvenShare.  The instance can be passed to child thread blocks
- * which can initialize their per-thread block offsets using \p BlockInit().
- */
-template <typename OffsetT>
-struct GridEvenShare
-{
-private:
-
-    OffsetT     total_tiles;
-    int         big_shares;
-    OffsetT     big_share_items;
-    OffsetT     normal_share_items;
-    OffsetT     normal_base_offset;
-
-public:
-
-    /// Total number of input items
-    OffsetT     num_items;
-
-    /// Grid size in thread blocks
-    int         grid_size;
-
-    /// OffsetT into input marking the beginning of the owning thread block's segment of input tiles
-    OffsetT     block_offset;
-
-    /// OffsetT into input of marking the end (one-past) of the owning thread block's segment of input tiles
-    OffsetT     block_end;
-
-    /// Stride between input tiles
-    OffsetT     block_stride;
-
-
-    /**
-     * \brief Constructor.
-     */
-    __host__ __device__ __forceinline__ GridEvenShare() :
-        total_tiles(0),
-        big_shares(0),
-        big_share_items(0),
-        normal_share_items(0),
-        normal_base_offset(0),
-        num_items(0),
-        grid_size(0),
-        block_offset(0),
-        block_end(0),
-        block_stride(0)
-    {}
-
-
-    /**
-     * \brief Dispatch initializer. To be called prior prior to kernel launch.
-     */
-    __host__ __device__ __forceinline__ void DispatchInit(
-        OffsetT num_items,          ///< Total number of input items
-        int     max_grid_size,      ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items)
-        int     tile_items)         ///< Number of data items per input tile
-    {
-        this->block_offset          = num_items;    // Initialize past-the-end
-        this->block_end             = num_items;    // Initialize past-the-end
-        this->num_items             = num_items;
-        this->total_tiles           = (num_items + tile_items - 1) / tile_items;
-        this->grid_size             = CUB_MIN(total_tiles, max_grid_size);
-        OffsetT avg_tiles_per_block = total_tiles / grid_size;
-        this->big_shares            = total_tiles - (avg_tiles_per_block * grid_size);        // leftover grains go to big blocks
-        this->normal_share_items    = avg_tiles_per_block * tile_items;
-        this->normal_base_offset    = big_shares * tile_items;
-        this->big_share_items       = normal_share_items + tile_items;
-    }
-
-
-    /**
-     * \brief Initializes ranges for the specified thread block index.  Specialized
-     * for a "raking" access pattern in which each thread block is assigned a
-     * consecutive sequence of input tiles.
-     */
-    template <int TILE_ITEMS>
-    __device__ __forceinline__ void BlockInit(
-        int block_id,
-        Int2Type<GRID_MAPPING_RAKE> /*strategy_tag*/)
-    {
-        block_stride = TILE_ITEMS;
-        if (block_id < big_shares)
-        {
-            // This thread block gets a big share of grains (avg_tiles_per_block + 1)
-            block_offset = (block_id * big_share_items);
-            block_end = block_offset + big_share_items;
-        }
-        else if (block_id < total_tiles)
-        {
-            // This thread block gets a normal share of grains (avg_tiles_per_block)
-            block_offset = normal_base_offset + (block_id * normal_share_items);
-            block_end = CUB_MIN(num_items, block_offset + normal_share_items);
-        }
-        // Else default past-the-end
-    }
-
-
-    /**
-     * \brief Block-initialization, specialized for a "raking" access
-     * pattern in which each thread block is assigned a consecutive sequence
-     * of input tiles.
-     */
-    template <int TILE_ITEMS>
-    __device__ __forceinline__ void BlockInit(
-        int block_id,
-        Int2Type<GRID_MAPPING_STRIP_MINE> /*strategy_tag*/)
-    {
-        block_stride = grid_size * TILE_ITEMS;
-        block_offset = (block_id * TILE_ITEMS);
-        block_end = num_items;
-    }
-
-
-    /**
-     * \brief Block-initialization, specialized for "strip mining" access
-     * pattern in which the input tiles assigned to each thread block are
-     * separated by a stride equal to the the extent of the grid.
-     */
-    template <
-        int TILE_ITEMS,
-        GridMappingStrategy STRATEGY>
-    __device__ __forceinline__ void BlockInit()
-    {
-        BlockInit<TILE_ITEMS>(blockIdx.x, Int2Type<STRATEGY>());
-    }
-
-
-    /**
-     * \brief Block-initialization, specialized for a "raking" access
-     * pattern in which each thread block is assigned a consecutive sequence
-     * of input tiles.
-     */
-    template <int TILE_ITEMS>
-    __device__ __forceinline__ void BlockInit(
-        OffsetT block_offset,                       ///< [in] Threadblock begin offset (inclusive)
-        OffsetT block_end)                          ///< [in] Threadblock end offset (exclusive)
-    {
-        this->block_offset = block_offset;
-        this->block_end = block_end;
-        this->block_stride = TILE_ITEMS;
-    }
-
-
-};
-
-
-
-
-
-/** @} */       // end group GridModule
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh b/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh
deleted file mode 100644
index 6d1ab5846..000000000
--- a/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh
+++ /dev/null
@@ -1,113 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
- */
-
-#pragma once
-
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup GridModule
- * @{
- */
-
-
-/******************************************************************************
- * Mapping policies
- *****************************************************************************/
-
-
-/**
- * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
- */
-enum GridMappingStrategy
-{
-    /**
-     * \brief An a "raking" access pattern in which each thread block is
-     * assigned a consecutive sequence of input tiles
-     *
-     * \par Overview
-     * The input is evenly partitioned into \p p segments, where \p p is
-     * constant and corresponds loosely to the number of thread blocks that may
-     * actively reside on the target device. Each segment is comprised of
-     * consecutive tiles, where a tile is a small, constant-sized unit of input
-     * to be processed to completion before the thread block terminates or
-     * obtains more work.  The kernel invokes \p p thread blocks, each
-     * of which iteratively consumes a segment of <em>n</em>/<em>p</em> elements
-     * in tile-size increments.
-     */
-    GRID_MAPPING_RAKE,
-
-    /**
-     * \brief An a "strip mining" access pattern in which the input tiles assigned
-     * to each thread block are separated by a stride equal to the the extent of
-     * the grid.
-     *
-     * \par Overview
-     * The input is evenly partitioned into \p p sets, where \p p is
-     * constant and corresponds loosely to the number of thread blocks that may
-     * actively reside on the target device. Each set is comprised of
-     * data tiles separated by stride \p tiles, where a tile is a small,
-     * constant-sized unit of input to be processed to completion before the
-     * thread block terminates or obtains more work.  The kernel invokes \p p
-     * thread blocks, each of which iteratively consumes a segment of
-     * <em>n</em>/<em>p</em> elements in tile-size increments.
-     */
-    GRID_MAPPING_STRIP_MINE,
-
-    /**
-     * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks.
-     *
-     * \par Overview
-     * The input is treated as a queue to be dynamically consumed by a grid of
-     * thread blocks.  Work is atomically dequeued in tiles, where a tile is a
-     * unit of input to be processed to completion before the thread block
-     * terminates or obtains more work.  The grid size \p p is constant,
-     * loosely corresponding to the number of thread blocks that may actively
-     * reside on the target device.
-     */
-    GRID_MAPPING_DYNAMIC,
-};
-
-
-/** @} */       // end group GridModule
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/grid/grid_queue.cuh b/thrust/system/cuda/detail/cub/grid/grid_queue.cuh
deleted file mode 100644
index 3c5330e4a..000000000
--- a/thrust/system/cuda/detail/cub/grid/grid_queue.cuh
+++ /dev/null
@@ -1,220 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::GridQueue is a descriptor utility for dynamic queue management.
- */
-
-#pragma once
-
-#include "../util_namespace.cuh"
-#include "../util_debug.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup GridModule
- * @{
- */
-
-
-/**
- * \brief GridQueue is a descriptor utility for dynamic queue management.
- *
- * \par Overview
- * GridQueue descriptors provides abstractions for "filling" or
- * "draining" globally-shared vectors.
- *
- * \par
- * A "filling" GridQueue works by atomically-adding to a zero-initialized counter,
- * returning a unique offset for the calling thread to write its items.
- * The GridQueue maintains the total "fill-size".  The fill counter must be reset
- * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that
- * will be filling.
- *
- * \par
- * Similarly, a "draining" GridQueue works by works by atomically-incrementing a
- * zero-initialized counter, returning a unique offset for the calling thread to
- * read its items. Threads can safely drain until the array's logical fill-size is
- * exceeded.  The drain counter must be reset using GridQueue::ResetDrain or
- * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that
- * will be filling.  (For dynamic work distribution of existing data, the corresponding fill-size
- * is simply the number of elements in the array.)
- *
- * \par
- * Iterative work management can be implemented simply with a pair of flip-flopping
- * work buffers, each with an associated set of fill and drain GridQueue descriptors.
- *
- * \tparam OffsetT Signed integer type for global offsets
- */
-template <typename OffsetT>
-class GridQueue
-{
-private:
-
-    /// Counter indices
-    enum
-    {
-        FILL    = 0,
-        DRAIN   = 1,
-    };
-
-    /// Pair of counters
-    OffsetT *d_counters;
-
-public:
-
-    /// Returns the device allocation size in bytes needed to construct a GridQueue instance
-    __host__ __device__ __forceinline__
-    static size_t AllocationSize()
-    {
-        return sizeof(OffsetT) * 2;
-    }
-
-
-    /// Constructs an invalid GridQueue descriptor
-    __host__ __device__ __forceinline__ GridQueue()
-    :
-        d_counters(NULL)
-    {}
-
-
-    /// Constructs a GridQueue descriptor around the device storage allocation
-    __host__ __device__ __forceinline__ GridQueue(
-        void *d_storage)                    ///< Device allocation to back the GridQueue.  Must be at least as big as <tt>AllocationSize()</tt>.
-    :
-        d_counters((OffsetT*) d_storage)
-    {}
-
-
-    /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance.  To be called by the host or by a kernel prior to that which will be draining.
-    __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain(
-        OffsetT fill_size,
-        cudaStream_t stream = 0)
-    {
-#if (CUB_PTX_ARCH > 0)
-        (void)stream;
-        d_counters[FILL] = fill_size;
-        d_counters[DRAIN] = 0;
-        return cudaSuccess;
-#else
-        OffsetT counters[2];
-        counters[FILL] = fill_size;
-        counters[DRAIN] = 0;
-        return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream));
-#endif
-    }
-
-
-    /// This operation resets the drain so that it may advance to meet the existing fill-size.  To be called by the host or by a kernel prior to that which will be draining.
-    __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0)
-    {
-#if (CUB_PTX_ARCH > 0)
-        (void)stream;
-        d_counters[DRAIN] = 0;
-        return cudaSuccess;
-#else
-        return CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream));
-#endif
-    }
-
-
-    /// This operation resets the fill counter.  To be called by the host or by a kernel prior to that which will be filling.
-    __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0)
-    {
-#if (CUB_PTX_ARCH > 0)
-        (void)stream;
-        d_counters[FILL] = 0;
-        return cudaSuccess;
-#else
-        return CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream));
-#endif
-    }
-
-
-    /// Returns the fill-size established by the parent or by the previous kernel.
-    __host__ __device__ __forceinline__ cudaError_t FillSize(
-        OffsetT &fill_size,
-        cudaStream_t stream = 0)
-    {
-#if (CUB_PTX_ARCH > 0)
-        (void)stream;
-        fill_size = d_counters[FILL];
-        return cudaSuccess;
-#else
-        return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream));
-#endif
-    }
-
-
-    /// Drain \p num_items from the queue.  Returns offset from which to read items.  To be called from CUDA kernel.
-    __device__ __forceinline__ OffsetT Drain(OffsetT num_items)
-    {
-        return atomicAdd(d_counters + DRAIN, num_items);
-    }
-
-
-    /// Fill \p num_items into the queue.  Returns offset from which to write items.    To be called from CUDA kernel.
-    __device__ __forceinline__ OffsetT Fill(OffsetT num_items)
-    {
-        return atomicAdd(d_counters + FILL, num_items);
-    }
-};
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-/**
- * Reset grid queue (call with 1 block of 1 thread)
- */
-template <typename OffsetT>
-__global__ void FillAndResetDrainKernel(
-    GridQueue<OffsetT>   grid_queue,
-    OffsetT              num_items)
-{
-    grid_queue.FillAndResetDrain(num_items);
-}
-
-
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/** @} */       // end group GridModule
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/host/mutex.cuh b/thrust/system/cuda/detail/cub/host/mutex.cuh
deleted file mode 100644
index 30d64b7d4..000000000
--- a/thrust/system/cuda/detail/cub/host/mutex.cuh
+++ /dev/null
@@ -1,171 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Simple portable mutex
- */
-
-
-#pragma once
-
-#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
-    #include <mutex>
-#else
-    #if defined(_WIN32) || defined(_WIN64)
-        #include <intrin.h>
-
-        #define WIN32_LEAN_AND_MEAN
-        #define NOMINMAX
-        #include <windows.h>
-        #undef WIN32_LEAN_AND_MEAN
-        #undef NOMINMAX
-
-        /**
-         * Compiler read/write barrier
-         */
-        #pragma intrinsic(_ReadWriteBarrier)
-
-    #endif
-#endif
-
-#include "../util_namespace.cuh"
-
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * Simple portable mutex
- *   - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms)
- *   - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++)
- */
-struct Mutex
-{
-#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800)
-
-    std::mutex mtx;
-
-    void Lock()
-    {
-        mtx.lock();
-    }
-
-    void Unlock()
-    {
-        mtx.unlock();
-    }
-
-    void TryLock()
-    {
-        mtx.try_lock();
-    }
-
-#else       //__cplusplus > 199711L
-
-    #if defined(_MSC_VER)
-
-        // Microsoft VC++
-        typedef long Spinlock;
-
-    #else
-
-        // GNU g++
-        typedef int Spinlock;
-
-        /**
-         * Compiler read/write barrier
-         */
-        __forceinline__ void _ReadWriteBarrier()
-        {
-            __sync_synchronize();
-        }
-
-        /**
-         * Atomic exchange
-         */
-        __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value)
-        {
-            // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier
-            _ReadWriteBarrier();
-            return __sync_lock_test_and_set(Target, Value);
-        }
-
-        /**
-         * Pause instruction to prevent excess processor bus usage
-         */
-        __forceinline__ void YieldProcessor()
-        {
-        }
-
-    #endif  // defined(_MSC_VER)
-
-        /// Lock member
-        volatile Spinlock lock;
-
-        /**
-         * Constructor
-         */
-        Mutex() : lock(0) {}
-
-        /**
-         * Return when the specified spinlock has been acquired
-         */
-        __forceinline__ void Lock()
-        {
-            while (1)
-            {
-                if (!_InterlockedExchange(&lock, 1)) return;
-                while (lock) YieldProcessor();
-            }
-        }
-
-
-        /**
-         * Release the specified spinlock
-         */
-        __forceinline__ void Unlock()
-        {
-            _ReadWriteBarrier();
-            lock = 0;
-        }
-
-#endif      // __cplusplus > 199711L
-
-};
-
-
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh
deleted file mode 100644
index e527202e4..000000000
--- a/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh
+++ /dev/null
@@ -1,259 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_namespace.cuh"
-
-#include <thrust/version.h>
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-/**
- * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p KeyValuePair tuples).
- *
- * \par Overview
- * - ArgIndexInputIteratorTwraps a random access input iterator \p itr of type \p InputIteratorT.
- *   Dereferencing an ArgIndexInputIteratorTat offset \p i produces a \p KeyValuePair value whose
- *   \p key field is \p i and whose \p value field is <tt>itr[i]</tt>.
- * - Can be used with any data type.
- * - Can be constructed, manipulated, and exchanged within and between host and device
- *   functions.  Wrapped host memory can only be dereferenced on the host, and wrapped
- *   device memory can only be dereferenced on the device.
- * - Compatible with Thrust API v1.7 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p ArgIndexInputIteratorTto
- * dereference an array of doubles
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/arg_index_input_iterator.cuh>
- *
- * // Declare, allocate, and initialize a device array
- * double *d_in;         // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
- *
- * // Create an iterator wrapper
- * cub::ArgIndexInputIterator<double*> itr(d_in);
- *
- * // Within device code:
- * typedef typename cub::ArgIndexInputIterator<double*>::value_type Tuple;
- * Tuple item_offset_pair.key = *itr;
- * printf("%f @ %d\n",
- *   item_offset_pair.value,
- *   item_offset_pair.key);   // 8.0 @ 0
- *
- * itr = itr + 6;
- * item_offset_pair.key = *itr;
- * printf("%f @ %d\n",
- *   item_offset_pair.value,
- *   item_offset_pair.key);   // 9.0 @ 6
- *
- * \endcode
- *
- * \tparam InputIteratorT       The value type of the wrapped input iterator
- * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
- * \tparam OutputValueT         The paired value type of the <offset,value> tuple (Default: value type of input iterator)
- */
-template <
-    typename    InputIteratorT,
-    typename    OffsetT             = ptrdiff_t,
-    typename    OutputValueT        = typename std::iterator_traits<InputIteratorT>::value_type>
-class ArgIndexInputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef ArgIndexInputIterator                       self_type;              ///< My own type
-    typedef OffsetT                                     difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef KeyValuePair<difference_type, OutputValueT> value_type;             ///< The type of the element the iterator can point to
-    typedef value_type*                                 pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef value_type                                  reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::any_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    InputIteratorT  itr;
-    difference_type offset;
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ ArgIndexInputIterator(
-        InputIteratorT  itr,            ///< Input iterator to wrap
-        difference_type offset = 0)     ///< OffsetT (in items) from \p itr denoting the position of the iterator
-    :
-        itr(itr),
-        offset(offset)
-    {}
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        offset++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        offset++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-        value_type retval;
-        retval.value = itr[offset];
-        retval.key = offset;
-        return retval;
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval(itr, offset + n);
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        offset += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval(itr, offset - n);
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        offset -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return offset - other.offset;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        self_type offset = (*this) + n;
-        return *offset;
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &(*(*this));
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return ((itr == rhs.itr) && (offset == rhs.offset));
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return ((itr != rhs.itr) || (offset != rhs.offset));
-    }
-
-    /// Normalize
-    __host__ __device__ __forceinline__ void normalize()
-    {
-        itr += offset;
-        offset = 0;
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/)
-    {
-        return os;
-    }
-};
-
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh
deleted file mode 100644
index 012a32180..000000000
--- a/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh
+++ /dev/null
@@ -1,240 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_namespace.cuh"
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-/**
- * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier.
- *
- * \par Overview
- * - CacheModifiedInputIteratorTis a random-access input iterator that wraps a native
- *   device pointer of type <tt>ValueType*</tt>. \p ValueType references are
- *   made by reading \p ValueType values through loads modified by \p MODIFIER.
- * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG",
- *   "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.).
- * - Can be constructed, manipulated, and exchanged within and between host and device
- *   functions, but can only be dereferenced within device functions.
- * - Compatible with Thrust API v1.7 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p CacheModifiedInputIteratorTto
- * dereference a device array of double using the "ldg" PTX load modifier
- * (i.e., load values through texture cache).
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/cache_modified_input_iterator.cuh>
- *
- * // Declare, allocate, and initialize a device array
- * double *d_in;            // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
- *
- * // Create an iterator wrapper
- * cub::CacheModifiedInputIterator<cub::LOAD_LDG, double> itr(d_in);
- *
- * // Within device code:
- * printf("%f\n", itr[0]);  // 8.0
- * printf("%f\n", itr[1]);  // 6.0
- * printf("%f\n", itr[6]);  // 9.0
- *
- * \endcode
- *
- * \tparam CacheLoadModifier    The cub::CacheLoadModifier to use when accessing data
- * \tparam ValueType            The value type of this iterator
- * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
- */
-template <
-    CacheLoadModifier   MODIFIER,
-    typename            ValueType,
-    typename            OffsetT = ptrdiff_t>
-class CacheModifiedInputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef CacheModifiedInputIterator          self_type;              ///< My own type
-    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
-    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::device_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-
-public:
-
-    /// Wrapped native pointer
-    ValueType* ptr;
-
-    /// Constructor
-    template <typename QualifiedValueType>
-    __host__ __device__ __forceinline__ CacheModifiedInputIterator(
-        QualifiedValueType* ptr)     ///< Native pointer to wrap
-    :
-        ptr(const_cast<typename RemoveQualifiers<QualifiedValueType>::Type *>(ptr))
-    {}
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        ptr++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        ptr++;
-        return *this;
-    }
-
-    /// Indirection
-    __device__ __forceinline__ reference operator*() const
-    {
-        return ThreadLoad<MODIFIER>(ptr);
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval(ptr + n);
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        ptr += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval(ptr - n);
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        ptr -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return ptr - other.ptr;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        return ThreadLoad<MODIFIER>(ptr + n);
-    }
-
-    /// Structure dereference
-    __device__ __forceinline__ pointer operator->()
-    {
-        return &ThreadLoad<MODIFIER>(ptr);
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return (ptr == rhs.ptr);
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return (ptr != rhs.ptr);
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/)
-    {
-        return os;
-    }
-};
-
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh
deleted file mode 100644
index 9038fed64..000000000
--- a/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh
+++ /dev/null
@@ -1,254 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_namespace.cuh"
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-/**
- * \brief A random-access output wrapper for storing array values using a PTX cache-modifier.
- *
- * \par Overview
- * - CacheModifiedOutputIterator is a random-access output iterator that wraps a native
- *   device pointer of type <tt>ValueType*</tt>. \p ValueType references are
- *   made by writing \p ValueType values through stores modified by \p MODIFIER.
- * - Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB",
- *   "STORE_CG", "STORE_CS", "STORE_WT", etc.).
- * - Can be constructed, manipulated, and exchanged within and between host and device
- *   functions, but can only be dereferenced within device functions.
- * - Compatible with Thrust API v1.7 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p CacheModifiedOutputIterator to
- * dereference a device array of doubles using the "wt" PTX load modifier
- * (i.e., write-through to system memory).
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/cache_modified_output_iterator.cuh>
- *
- * // Declare, allocate, and initialize a device array
- * double *d_out;              // e.g., [, , , , , , ]
- *
- * // Create an iterator wrapper
- * cub::CacheModifiedOutputIterator<cub::STORE_WT, double> itr(d_out);
- *
- * // Within device code:
- * itr[0]  = 8.0;
- * itr[1]  = 66.0;
- * itr[55] = 24.0;
- *
- * \endcode
- *
- * \par Usage Considerations
- * - Can only be dereferenced within device code
- *
- * \tparam CacheStoreModifier     The cub::CacheStoreModifier to use when accessing data
- * \tparam ValueType            The value type of this iterator
- * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
- */
-template <
-    CacheStoreModifier  MODIFIER,
-    typename            ValueType,
-    typename            OffsetT = ptrdiff_t>
-class CacheModifiedOutputIterator
-{
-private:
-
-    // Proxy object
-    struct Reference
-    {
-        ValueType* ptr;
-
-        /// Constructor
-        __host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {}
-
-        /// Assignment
-        __device__ __forceinline__ ValueType operator =(ValueType val)
-        {
-            ThreadStore<MODIFIER>(ptr, val);
-            return val;
-        }
-    };
-
-public:
-
-    // Required iterator traits
-    typedef CacheModifiedOutputIterator         self_type;              ///< My own type
-    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef void                                value_type;             ///< The type of the element the iterator can point to
-    typedef void                                pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef Reference                           reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::device_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    ValueType* ptr;
-
-public:
-
-    /// Constructor
-    template <typename QualifiedValueType>
-    __host__ __device__ __forceinline__ CacheModifiedOutputIterator(
-        QualifiedValueType* ptr)     ///< Native pointer to wrap
-    :
-        ptr(const_cast<typename RemoveQualifiers<QualifiedValueType>::Type *>(ptr))
-    {}
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        ptr++;
-        return retval;
-    }
-
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        ptr++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-        return Reference(ptr);
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval(ptr + n);
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        ptr += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval(ptr - n);
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        ptr -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return ptr - other.ptr;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        return Reference(ptr + n);
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return (ptr == rhs.ptr);
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return (ptr != rhs.ptr);
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        return os;
-    }
-};
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh
deleted file mode 100644
index e2582db35..000000000
--- a/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh
+++ /dev/null
@@ -1,235 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_namespace.cuh"
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-/**
- * \brief A random-access input generator for dereferencing a sequence of homogeneous values
- *
- * \par Overview
- * - Read references to a ConstantInputIteratorTiterator always return the supplied constant
- *   of type \p ValueType.
- * - Can be used with any data type.
- * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device
- *   functions.
- * - Compatible with Thrust API v1.7 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p ConstantInputIteratorTto
- * dereference a sequence of homogeneous doubles.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/constant_input_iterator.cuh>
- *
- * cub::ConstantInputIterator<double> itr(5.0);
- *
- * printf("%f\n", itr[0]);      // 5.0
- * printf("%f\n", itr[1]);      // 5.0
- * printf("%f\n", itr[2]);      // 5.0
- * printf("%f\n", itr[50]);     // 5.0
- *
- * \endcode
- *
- * \tparam ValueType            The value type of this iterator
- * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
- */
-template <
-    typename ValueType,
-    typename OffsetT = ptrdiff_t>
-class ConstantInputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef ConstantInputIterator               self_type;              ///< My own type
-    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
-    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::any_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    ValueType   val;
-    OffsetT     offset;
-#ifdef _WIN32
-    OffsetT     pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))];        // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce)
-#endif
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ ConstantInputIterator(
-        ValueType   val,            ///< Starting value for the iterator instance to report
-        OffsetT     offset = 0)     ///< Base offset
-    :
-        val(val),
-        offset(offset)
-    {}
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        offset++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        offset++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-        return val;
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval(val, offset + n);
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        offset += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval(val, offset - n);
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        offset -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return offset - other.offset;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance /*n*/) const
-    {
-        return val;
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &val;
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return (offset == rhs.offset) && ((val == rhs.val));
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return (offset != rhs.offset) || (val!= rhs.val);
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        os << "[" << itr.val << "," << itr.offset << "]";
-        return os;
-    }
-
-};
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh
deleted file mode 100644
index 69a736302..000000000
--- a/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh
+++ /dev/null
@@ -1,228 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_namespace.cuh"
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-/**
- * \brief A random-access input generator for dereferencing a sequence of incrementing integer values.
- *
- * \par Overview
- * - After initializing a CountingInputIteratorTto a certain integer \p base, read references
- *   at \p offset will return the value \p base + \p offset.
- * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device
- *   functions.
- * - Compatible with Thrust API v1.7 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p CountingInputIteratorTto
- * dereference a sequence of incrementing integers.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/counting_input_iterator.cuh>
- *
- * cub::CountingInputIterator<int> itr(5);
- *
- * printf("%d\n", itr[0]);      // 5
- * printf("%d\n", itr[1]);      // 6
- * printf("%d\n", itr[2]);      // 7
- * printf("%d\n", itr[50]);     // 55
- *
- * \endcode
- *
- * \tparam ValueType            The value type of this iterator
- * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
- */
-template <
-    typename ValueType,
-    typename OffsetT = ptrdiff_t>
-class CountingInputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef CountingInputIterator               self_type;              ///< My own type
-    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
-    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::any_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    ValueType val;
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ CountingInputIterator(
-        const ValueType &val)          ///< Starting value for the iterator instance to report
-    :
-        val(val)
-    {}
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        val++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        val++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-        return val;
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval(val + (ValueType) n);
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        val += (ValueType) n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval(val - (ValueType) n);
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        val -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return (difference_type) (val - other.val);
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        return val + (ValueType) n;
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &val;
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return (val == rhs.val);
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return (val != rhs.val);
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        os << "[" << itr.val << "]";
-        return os;
-    }
-
-};
-
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh
deleted file mode 100644
index 497b2893a..000000000
--- a/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh
+++ /dev/null
@@ -1,220 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../util_namespace.cuh"
-#include "../util_macro.cuh"
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-/**
- * \brief A discard iterator
- */
-template <typename OffsetT = ptrdiff_t>
-class DiscardOutputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef DiscardOutputIterator   self_type;              ///< My own type
-    typedef OffsetT                 difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef void                    value_type;             ///< The type of the element the iterator can point to
-    typedef void                    pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef void                    reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::any_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    OffsetT offset;
-
-#if defined(_WIN32) || !defined(_WIN64)
-    // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce)
-    OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))];
-#endif
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ DiscardOutputIterator(
-        OffsetT offset = 0)     ///< Base offset
-    :
-        offset(offset)
-    {}
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        offset++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        offset++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ self_type& operator*()
-    {
-        // return self reference, which can be assigned to anything
-        return *this;
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval(offset + n);
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        offset += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval(offset - n);
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        offset -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return offset - other.offset;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator[](Distance n)
-    {
-        // return self reference, which can be assigned to anything
-        return *this;
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return;
-    }
-
-    /// Assignment to self (no-op)
-    __host__ __device__ __forceinline__ void operator=(self_type const& other)
-    {
-        offset = other.offset;
-    }
-
-    /// Assignment to anything else (no-op)
-    template<typename T>
-    __host__ __device__ __forceinline__ void operator=(T const&)
-    {}
-
-    /// Cast to void* operator
-    __host__ __device__ __forceinline__ operator void*() const { return NULL; }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return (offset == rhs.offset);
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return (offset != rhs.offset);
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        os << "[" << itr.offset << "]";
-        return os;
-    }
-
-};
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh
deleted file mode 100644
index 7067ae001..000000000
--- a/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh
+++ /dev/null
@@ -1,310 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_debug.cuh"
-#include "../util_namespace.cuh"
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-
-/**
- * \brief A random-access input wrapper for dereferencing array values through texture cache.  Uses newer Kepler-style texture objects.
- *
- * \par Overview
- * - TexObjInputIteratorTwraps a native device pointer of type <tt>ValueType*</tt>. References
- *   to elements are to be loaded through texture cache.
- * - Can be used to load any data type from memory through texture cache.
- * - Can be manipulated and exchanged within and between host and device
- *   functions, can only be constructed within host functions, and can only be
- *   dereferenced within device functions.
- * - With regard to nested/dynamic parallelism, TexObjInputIteratorTiterators may only be
- *   created by the host thread, but can be used by any descendant kernel.
- * - Compatible with Thrust API v1.7 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p TexRefInputIteratorTto
- * dereference a device array of doubles through texture cache.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/tex_obj_input_iterator.cuh>
- *
- * // Declare, allocate, and initialize a device array
- * int num_items;   // e.g., 7
- * double *d_in;    // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
- *
- * // Create an iterator wrapper
- * cub::TexObjInputIterator<double> itr;
- * itr.BindTexture(d_in, sizeof(double) * num_items);
- * ...
- *
- * // Within device code:
- * printf("%f\n", itr[0]);      // 8.0
- * printf("%f\n", itr[1]);      // 6.0
- * printf("%f\n", itr[6]);      // 9.0
- *
- * ...
- * itr.UnbindTexture();
- *
- * \endcode
- *
- * \tparam T                    The value type of this iterator
- * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
- */
-template <
-    typename    T,
-    typename    OffsetT = ptrdiff_t>
-class TexObjInputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef TexObjInputIterator                 self_type;              ///< My own type
-    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef T                                   value_type;             ///< The type of the element the iterator can point to
-    typedef T*                                  pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef T                                   reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::device_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    // Largest texture word we can use in device
-    typedef typename UnitWord<T>::TextureWord TextureWord;
-
-    // Number of texture words per T
-    enum {
-        TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord)
-    };
-
-private:
-
-    T*                  ptr;
-    difference_type     tex_offset;
-    cudaTextureObject_t tex_obj;
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ TexObjInputIterator()
-    :
-        ptr(NULL),
-        tex_offset(0),
-        tex_obj(0)
-    {}
-
-    /// Use this iterator to bind \p ptr with a texture reference
-    template <typename QualifiedT>
-    cudaError_t BindTexture(
-        QualifiedT      *ptr,               ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment
-        size_t          bytes = size_t(-1),         ///< Number of bytes in the range
-        size_t          tex_offset = 0)     ///< OffsetT (in items) from \p ptr denoting the position of the iterator
-    {
-        this->ptr = const_cast<typename RemoveQualifiers<QualifiedT>::Type *>(ptr);
-        this->tex_offset = tex_offset;
-
-        cudaChannelFormatDesc   channel_desc = cudaCreateChannelDesc<TextureWord>();
-        cudaResourceDesc        res_desc;
-        cudaTextureDesc         tex_desc;
-        memset(&res_desc, 0, sizeof(cudaResourceDesc));
-        memset(&tex_desc, 0, sizeof(cudaTextureDesc));
-        res_desc.resType                = cudaResourceTypeLinear;
-        res_desc.res.linear.devPtr      = this->ptr;
-        res_desc.res.linear.desc        = channel_desc;
-        res_desc.res.linear.sizeInBytes = bytes;
-        tex_desc.readMode               = cudaReadModeElementType;
-        return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);
-    }
-
-    /// Unbind this iterator from its texture reference
-    cudaError_t UnbindTexture()
-    {
-        return cudaDestroyTextureObject(tex_obj);
-    }
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        tex_offset++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        tex_offset++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-#if (CUB_PTX_ARCH == 0)
-        // Simply dereference the pointer on the host
-        return ptr[tex_offset];
-#else
-        // Move array of uninitialized words, then alias and assign to return value
-        TextureWord words[TEXTURE_MULTIPLE];
-
-        #pragma unroll
-        for (int i = 0; i < TEXTURE_MULTIPLE; ++i)
-        {
-            words[i] = tex1Dfetch<TextureWord>(
-                tex_obj,
-                (tex_offset * TEXTURE_MULTIPLE) + i);
-        }
-
-        // Load from words
-        return *reinterpret_cast<T*>(words);
-#endif
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval;
-        retval.ptr          = ptr;
-        retval.tex_obj      = tex_obj;
-        retval.tex_offset   = tex_offset + n;
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        tex_offset += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval;
-        retval.ptr          = ptr;
-        retval.tex_obj      = tex_obj;
-        retval.tex_offset   = tex_offset - n;
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        tex_offset -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return tex_offset - other.tex_offset;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        self_type offset = (*this) + n;
-        return *offset;
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &(*(*this));
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset) && (tex_obj == rhs.tex_obj));
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj));
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        return os;
-    }
-
-};
-
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh
deleted file mode 100644
index 73904b787..000000000
--- a/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh
+++ /dev/null
@@ -1,374 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_debug.cuh"
-#include "../util_namespace.cuh"
-
-#if (CUDA_VERSION >= 5050) || defined(DOXYGEN_ACTIVE)  // This iterator is compatible with CUDA 5.5 and newer
-
-#if (THRUST_VERSION >= 100700)    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Static file-scope Tesla/Fermi-style texture references
- *****************************************************************************/
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-// Anonymous namespace
-namespace {
-
-/// Global texture reference specialized by type
-template <typename T>
-struct IteratorTexRef
-{
-    /// And by unique ID
-    template <int UNIQUE_ID>
-    struct TexId
-    {
-        // Largest texture word we can use in device
-        typedef typename UnitWord<T>::DeviceWord DeviceWord;
-        typedef typename UnitWord<T>::TextureWord TextureWord;
-
-        // Number of texture words per T
-        enum {
-            DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord),
-            TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord)
-        };
-
-        // Texture reference type
-        typedef texture<TextureWord> TexRef;
-
-        // Texture reference
-        static TexRef ref;
-
-        /// Bind texture
-        static cudaError_t BindTexture(void *d_in, size_t &offset)
-        {
-            if (d_in)
-            {
-                cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc<TextureWord>();
-                ref.channelDesc = tex_desc;
-                return (CubDebug(cudaBindTexture(&offset, ref, d_in)));
-            }
-
-            return cudaSuccess;
-        }
-
-        /// Unbind texture
-        static cudaError_t UnbindTexture()
-        {
-            return CubDebug(cudaUnbindTexture(ref));
-        }
-
-        /// Fetch element
-        template <typename Distance>
-        static __device__ __forceinline__ T Fetch(Distance tex_offset)
-        {
-            DeviceWord temp[DEVICE_MULTIPLE];
-            TextureWord *words = reinterpret_cast<TextureWord*>(temp);
-
-            #pragma unroll
-            for (int i = 0; i < TEXTURE_MULTIPLE; ++i)
-            {
-                words[i] = tex1Dfetch(ref, (tex_offset * TEXTURE_MULTIPLE) + i);
-            }
-
-            return reinterpret_cast<T&>(temp);
-        }
-    };
-};
-
-// Texture reference definitions
-template <typename  T>
-template <int       UNIQUE_ID>
-typename IteratorTexRef<T>::template TexId<UNIQUE_ID>::TexRef IteratorTexRef<T>::template TexId<UNIQUE_ID>::ref = 0;
-
-
-} // Anonymous namespace
-
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-
-/**
- * \brief A random-access input wrapper for dereferencing array values through texture cache.  Uses older Tesla/Fermi-style texture references.
- *
- * \par Overview
- * - TexRefInputIteratorTwraps a native device pointer of type <tt>ValueType*</tt>. References
- *   to elements are to be loaded through texture cache.
- * - Can be used to load any data type from memory through texture cache.
- * - Can be manipulated and exchanged within and between host and device
- *   functions, can only be constructed within host functions, and can only be
- *   dereferenced within device functions.
- * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture
- *   reference.  Only one TexRefInputIteratorTinstance can be bound at any given time for a
- *   specific combination of (1) data type \p T, (2) \p UNIQUE_ID, (3) host
- *   thread, and (4) compilation .o unit.
- * - With regard to nested/dynamic parallelism, TexRefInputIteratorTiterators may only be
- *   created by the host thread and used by a top-level kernel (i.e. the one which is launched
- *   from the host).
- * - Compatible with Thrust API v1.7 or newer.
- * - Compatible with CUDA toolkit v5.5 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p TexRefInputIteratorTto
- * dereference a device array of doubles through texture cache.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/tex_ref_input_iterator.cuh>
- *
- * // Declare, allocate, and initialize a device array
- * int num_items;   // e.g., 7
- * double *d_in;    // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
- *
- * // Create an iterator wrapper
- * cub::TexRefInputIterator<double, __LINE__> itr;
- * itr.BindTexture(d_in, sizeof(double) * num_items);
- * ...
- *
- * // Within device code:
- * printf("%f\n", itr[0]);      // 8.0
- * printf("%f\n", itr[1]);      // 6.0
- * printf("%f\n", itr[6]);      // 9.0
- *
- * ...
- * itr.UnbindTexture();
- *
- * \endcode
- *
- * \tparam T                    The value type of this iterator
- * \tparam UNIQUE_ID            A globally-unique identifier (within the compilation unit) to name the underlying texture reference
- * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
- */
-template <
-    typename    T,
-    int         UNIQUE_ID,
-    typename    OffsetT = ptrdiff_t>
-class TexRefInputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef TexRefInputIterator                 self_type;              ///< My own type
-    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef T                                   value_type;             ///< The type of the element the iterator can point to
-    typedef T*                                  pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef T                                   reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::device_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    T*              ptr;
-    difference_type tex_offset;
-
-    // Texture reference wrapper (old Tesla/Fermi-style textures)
-    typedef typename IteratorTexRef<T>::template TexId<UNIQUE_ID> TexId;
-
-public:
-/*
-    /// Constructor
-    __host__ __device__ __forceinline__ TexRefInputIterator()
-    :
-        ptr(NULL),
-        tex_offset(0)
-    {}
-*/
-    /// Use this iterator to bind \p ptr with a texture reference
-    template <typename QualifiedT>
-    cudaError_t BindTexture(
-        QualifiedT      *ptr,                   ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment
-        size_t          bytes = size_t(-1),     ///< Number of bytes in the range
-        size_t          tex_offset = 0)         ///< OffsetT (in items) from \p ptr denoting the position of the iterator
-    {
-        this->ptr = const_cast<typename RemoveQualifiers<QualifiedT>::Type *>(ptr);
-        size_t offset;
-        cudaError_t retval = TexId::BindTexture(this->ptr + tex_offset, offset);
-        this->tex_offset = (difference_type) (offset / sizeof(QualifiedT));
-        return retval;
-    }
-
-    /// Unbind this iterator from its texture reference
-    cudaError_t UnbindTexture()
-    {
-        return TexId::UnbindTexture();
-    }
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        tex_offset++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        tex_offset++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-#if (CUB_PTX_ARCH == 0)
-        // Simply dereference the pointer on the host
-        return ptr[tex_offset];
-#else
-        // Use the texture reference
-        return TexId::Fetch(tex_offset);
-#endif
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval;
-        retval.ptr = ptr;
-        retval.tex_offset = tex_offset + n;
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        tex_offset += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval;
-        retval.ptr = ptr;
-        retval.tex_offset = tex_offset - n;
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        tex_offset -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return tex_offset - other.tex_offset;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        self_type offset = (*this) + n;
-        return *offset;
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &(*(*this));
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset));
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset));
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        return os;
-    }
-
-};
-
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-#endif // CUDA_VERSION
diff --git a/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh
deleted file mode 100644
index 5ab407b0c..000000000
--- a/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh
+++ /dev/null
@@ -1,252 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_namespace.cuh"
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-/**
- * \brief A random-access input wrapper for transforming dereferenced values.
- *
- * \par Overview
- * - TransformInputIteratorTwraps a unary conversion functor of type \p
- *   ConversionOp and a random-access input iterator of type <tt>InputIteratorT</tt>,
- *   using the former to produce references of type \p ValueType from the latter.
- * - Can be used with any data type.
- * - Can be constructed, manipulated, and exchanged within and between host and device
- *   functions.  Wrapped host memory can only be dereferenced on the host, and wrapped
- *   device memory can only be dereferenced on the device.
- * - Compatible with Thrust API v1.7 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p TransformInputIteratorTto
- * dereference an array of integers, tripling the values and converting them to doubles.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/transform_input_iterator.cuh>
- *
- * // Functor for tripling integer values and converting to doubles
- * struct TripleDoubler
- * {
- *     __host__ __device__ __forceinline__
- *     double operator()(const int &a) const {
- *         return double(a * 3);
- *     }
- * };
- *
- * // Declare, allocate, and initialize a device array
- * int *d_in;                   // e.g., [8, 6, 7, 5, 3, 0, 9]
- * TripleDoubler conversion_op;
- *
- * // Create an iterator wrapper
- * cub::TransformInputIterator<double, TripleDoubler, int*> itr(d_in, conversion_op);
- *
- * // Within device code:
- * printf("%f\n", itr[0]);  // 24.0
- * printf("%f\n", itr[1]);  // 18.0
- * printf("%f\n", itr[6]);  // 27.0
- *
- * \endcode
- *
- * \tparam ValueType            The value type of this iterator
- * \tparam ConversionOp         Unary functor type for mapping objects of type \p InputType to type \p ValueType.  Must have member <tt>ValueType operator()(const InputType &datum)</tt>.
- * \tparam InputIteratorT       The type of the wrapped input iterator
- * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
- *
- */
-template <
-    typename ValueType,
-    typename ConversionOp,
-    typename InputIteratorT,
-    typename OffsetT = ptrdiff_t>
-class TransformInputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef TransformInputIterator              self_type;              ///< My own type
-    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
-    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::any_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    ConversionOp    conversion_op;
-    InputIteratorT  input_itr;
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ TransformInputIterator(
-        InputIteratorT      input_itr,          ///< Input iterator to wrap
-        ConversionOp        conversion_op)      ///< Conversion functor to wrap
-    :
-        conversion_op(conversion_op),
-        input_itr(input_itr)
-    {}
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        input_itr++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        input_itr++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-        return conversion_op(*input_itr);
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval(input_itr + n, conversion_op);
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        input_itr += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval(input_itr - n, conversion_op);
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        input_itr -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return input_itr - other.input_itr;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        return conversion_op(input_itr[n]);
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &conversion_op(*input_itr);
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return (input_itr == rhs.input_itr);
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return (input_itr != rhs.input_itr);
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        return os;
-    }
-};
-
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/thread/thread_load.cuh b/thrust/system/cuda/detail/cub/thread/thread_load.cuh
deleted file mode 100644
index 26f419f2d..000000000
--- a/thrust/system/cuda/detail/cub/thread/thread_load.cuh
+++ /dev/null
@@ -1,438 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Thread utilities for reading memory using PTX cache modifiers.
- */
-
-#pragma once
-
-//#include <cuda.h>
-
-#include <iterator>
-
-#include "../util_ptx.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIo
- * @{
- */
-
-//-----------------------------------------------------------------------------
-// Tags and constants
-//-----------------------------------------------------------------------------
-
-/**
- * \brief Enumeration of cache modifiers for memory load operations.
- */
-enum CacheLoadModifier
-{
-    LOAD_DEFAULT,       ///< Default (no modifier)
-    LOAD_CA,            ///< Cache at all levels
-    LOAD_CG,            ///< Cache at global level
-    LOAD_CS,            ///< Cache streaming (likely to be accessed once)
-    LOAD_CV,            ///< Cache as volatile (including cached system lines)
-    LOAD_LDG,           ///< Cache as texture
-    LOAD_VOLATILE,      ///< Volatile (any memory space)
-};
-
-
-/**
- * \name Thread I/O (cache modified)
- * @{
- */
-
-/**
- * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers.  Can be used to load any data type.
- *
- * \par Example
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_load.cuh>
- *
- * // 32-bit load using cache-global modifier:
- * int *d_in;
- * int val = cub::ThreadLoad<cub::LOAD_CA>(d_in + threadIdx.x);
- *
- * // 16-bit load using default modifier
- * short *d_in;
- * short val = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_in + threadIdx.x);
- *
- * // 256-bit load using cache-volatile modifier
- * double4 *d_in;
- * double4 val = cub::ThreadLoad<cub::LOAD_CV>(d_in + threadIdx.x);
- *
- * // 96-bit load using cache-streaming modifier
- * struct TestFoo { bool a; short b; };
- * TestFoo *d_struct;
- * TestFoo val = cub::ThreadLoad<cub::LOAD_CS>(d_in + threadIdx.x);
- * \endcode
- *
- * \tparam MODIFIER             <b>[inferred]</b> CacheLoadModifier enumeration
- * \tparam InputIteratorT       <b>[inferred]</b> Input iterator type \iterator
- */
-template <
-    CacheLoadModifier MODIFIER,
-    typename InputIteratorT>
-__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr);
-
-
-//@}  end member group
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-/// Helper structure for templated load iteration (inductive case)
-template <int COUNT, int MAX>
-struct IterateThreadLoad
-{
-    template <CacheLoadModifier MODIFIER, typename T>
-    static __device__ __forceinline__ void Load(T const *ptr, T *vals)
-    {
-        vals[COUNT] = ThreadLoad<MODIFIER>(ptr + COUNT);
-        IterateThreadLoad<COUNT + 1, MAX>::template Load<MODIFIER>(ptr, vals);
-    }
-
-    template <typename InputIteratorT, typename T>
-    static __device__ __forceinline__ void Dereference(InputIteratorT itr, T *vals)
-    {
-        vals[COUNT] = itr[COUNT];
-        IterateThreadLoad<COUNT + 1, MAX>::Dereference(itr, vals);
-    }
-};
-
-
-/// Helper structure for templated load iteration (termination case)
-template <int MAX>
-struct IterateThreadLoad<MAX, MAX>
-{
-    template <CacheLoadModifier MODIFIER, typename T>
-    static __device__ __forceinline__ void Load(T const * /*ptr*/, T * /*vals*/) {}
-
-    template <typename InputIteratorT, typename T>
-    static __device__ __forceinline__ void Dereference(InputIteratorT /*itr*/, T * /*vals*/) {}
-};
-
-
-/**
- * Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier
- */
-#define _CUB_LOAD_16(cub_modifier, ptx_modifier)                                             \
-    template<>                                                                              \
-    __device__ __forceinline__ uint4 ThreadLoad<cub_modifier, uint4 const *>(uint4 const *ptr)                   \
-    {                                                                                       \
-        uint4 retval;                                                                       \
-        asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" :                 \
-            "=r"(retval.x),                                                                 \
-            "=r"(retval.y),                                                                 \
-            "=r"(retval.z),                                                                 \
-            "=r"(retval.w) :                                                                \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return retval;                                                                      \
-    }                                                                                       \
-    template<>                                                                              \
-    __device__ __forceinline__ ulonglong2 ThreadLoad<cub_modifier, ulonglong2 const *>(ulonglong2 const *ptr)    \
-    {                                                                                       \
-        ulonglong2 retval;                                                                  \
-        asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" :                         \
-            "=l"(retval.x),                                                                 \
-            "=l"(retval.y) :                                                                \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return retval;                                                                      \
-    }
-
-/**
- * Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier
- */
-#define _CUB_LOAD_8(cub_modifier, ptx_modifier)                                              \
-    template<>                                                                              \
-    __device__ __forceinline__ ushort4 ThreadLoad<cub_modifier, ushort4 const *>(ushort4 const *ptr)             \
-    {                                                                                       \
-        ushort4 retval;                                                                     \
-        asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" :                 \
-            "=h"(retval.x),                                                                 \
-            "=h"(retval.y),                                                                 \
-            "=h"(retval.z),                                                                 \
-            "=h"(retval.w) :                                                                \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return retval;                                                                      \
-    }                                                                                       \
-    template<>                                                                              \
-    __device__ __forceinline__ uint2 ThreadLoad<cub_modifier, uint2 const *>(uint2 const *ptr)                   \
-    {                                                                                       \
-        uint2 retval;                                                                       \
-        asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" :                         \
-            "=r"(retval.x),                                                                 \
-            "=r"(retval.y) :                                                                \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return retval;                                                                      \
-    }                                                                                       \
-    template<>                                                                              \
-    __device__ __forceinline__ unsigned long long ThreadLoad<cub_modifier, unsigned long long const *>(unsigned long long const *ptr)    \
-    {                                                                                       \
-        unsigned long long retval;                                                          \
-        asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" :                                  \
-            "=l"(retval) :                                                                  \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return retval;                                                                      \
-    }
-
-/**
- * Define a uint (4B) ThreadLoad specialization for the given Cache load modifier
- */
-#define _CUB_LOAD_4(cub_modifier, ptx_modifier)                                              \
-    template<>                                                                              \
-    __device__ __forceinline__ unsigned int ThreadLoad<cub_modifier, unsigned int const *>(unsigned int const *ptr)                      \
-    {                                                                                       \
-        unsigned int retval;                                                                \
-        asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" :                                  \
-            "=r"(retval) :                                                                  \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return retval;                                                                      \
-    }
-
-
-/**
- * Define a unsigned short (2B) ThreadLoad specialization for the given Cache load modifier
- */
-#define _CUB_LOAD_2(cub_modifier, ptx_modifier)                                              \
-    template<>                                                                              \
-    __device__ __forceinline__ unsigned short ThreadLoad<cub_modifier, unsigned short const *>(unsigned short const *ptr)                \
-    {                                                                                       \
-        unsigned short retval;                                                              \
-        asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" :                                  \
-            "=h"(retval) :                                                                  \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return retval;                                                                      \
-    }
-
-
-/**
- * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier
- */
-#define _CUB_LOAD_1(cub_modifier, ptx_modifier)                                              \
-    template<>                                                                              \
-    __device__ __forceinline__ unsigned char ThreadLoad<cub_modifier, unsigned char const *>(unsigned char const *ptr)                   \
-    {                                                                                       \
-        unsigned short retval;                                                              \
-        asm volatile (                                                                      \
-        "{"                                                                                 \
-        "   .reg .u8 datum;"                                                                \
-        "    ld."#ptx_modifier".u8 datum, [%1];"                                            \
-        "    cvt.u16.u8 %0, datum;"                                                         \
-        "}" :                                                                               \
-            "=h"(retval) :                                                                  \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return (unsigned char) retval;                                                      \
-    }
-
-
-/**
- * Define powers-of-two ThreadLoad specializations for the given Cache load modifier
- */
-#define _CUB_LOAD_ALL(cub_modifier, ptx_modifier)                                            \
-    _CUB_LOAD_16(cub_modifier, ptx_modifier)                                                 \
-    _CUB_LOAD_8(cub_modifier, ptx_modifier)                                                  \
-    _CUB_LOAD_4(cub_modifier, ptx_modifier)                                                  \
-    _CUB_LOAD_2(cub_modifier, ptx_modifier)                                                  \
-    _CUB_LOAD_1(cub_modifier, ptx_modifier)                                                  \
-
-
-/**
- * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers
- */
-#if CUB_PTX_ARCH >= 200
-    _CUB_LOAD_ALL(LOAD_CA, ca)
-    _CUB_LOAD_ALL(LOAD_CG, cg)
-    _CUB_LOAD_ALL(LOAD_CS, cs)
-    _CUB_LOAD_ALL(LOAD_CV, cv)
-#else
-    _CUB_LOAD_ALL(LOAD_CA, global)
-    // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1
-    _CUB_LOAD_ALL(LOAD_CG, volatile.global)
-    _CUB_LOAD_ALL(LOAD_CS, global)
-    _CUB_LOAD_ALL(LOAD_CV, volatile.global)
-#endif
-
-#if CUB_PTX_ARCH >= 350
-    _CUB_LOAD_ALL(LOAD_LDG, global.nc)
-#else
-    _CUB_LOAD_ALL(LOAD_LDG, global)
-#endif
-
-
-// Macro cleanup
-#undef _CUB_LOAD_ALL
-#undef _CUB_LOAD_1
-#undef _CUB_LOAD_2
-#undef _CUB_LOAD_4
-#undef _CUB_LOAD_8
-#undef _CUB_LOAD_16
-
-
-
-/**
- * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types
- */
-template <typename InputIteratorT>
-__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(
-    InputIteratorT          itr,
-    Int2Type<LOAD_DEFAULT>  /*modifier*/,
-    Int2Type<false>         /*is_pointer*/)
-{
-    return *itr;
-}
-
-
-/**
- * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types
- */
-template <typename T>
-__device__ __forceinline__ T ThreadLoad(
-    T                       *ptr,
-    Int2Type<LOAD_DEFAULT>  /*modifier*/,
-    Int2Type<true>          /*is_pointer*/)
-{
-    return *ptr;
-}
-
-
-/**
- * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types
- */
-template <typename T>
-__device__ __forceinline__ T ThreadLoadVolatilePointer(
-    T                       *ptr,
-    Int2Type<true>          /*is_primitive*/)
-{
-    T retval = *reinterpret_cast<volatile T*>(ptr);
-    return retval;
-}
-
-
-/**
- * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types
- */
-template <typename T>
-__device__ __forceinline__ T ThreadLoadVolatilePointer(
-    T                       *ptr,
-    Int2Type<false>         /*is_primitive*/)
-{
-    typedef typename UnitWord<T>::VolatileWord VolatileWord;   // Word type for memcopying
-
-    const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
-/*
-    VolatileWord words[VOLATILE_MULTIPLE];
-
-    IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference(
-        reinterpret_cast<volatile VolatileWord*>(ptr),
-        words);
-
-    return *reinterpret_cast<T*>(words);
-*/
-
-    T retval;
-    VolatileWord *words = reinterpret_cast<VolatileWord*>(&retval);
-    IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference(
-        reinterpret_cast<volatile VolatileWord*>(ptr),
-        words);
-    return retval;
-}
-
-
-/**
- * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types
- */
-template <typename T>
-__device__ __forceinline__ T ThreadLoad(
-    T                       *ptr,
-    Int2Type<LOAD_VOLATILE> /*modifier*/,
-    Int2Type<true>          /*is_pointer*/)
-{
-    // Apply tags for partial-specialization
-    return ThreadLoadVolatilePointer(ptr, Int2Type<Traits<T>::PRIMITIVE>());
-}
-
-
-/**
- * ThreadLoad definition for generic modifiers on pointer types
- */
-template <typename T, int MODIFIER>
-__device__ __forceinline__ T ThreadLoad(
-    T const                 *ptr,
-    Int2Type<MODIFIER>      /*modifier*/,
-    Int2Type<true>          /*is_pointer*/)
-{
-    typedef typename UnitWord<T>::DeviceWord DeviceWord;
-
-    const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);
-
-    DeviceWord words[DEVICE_MULTIPLE];
-
-    IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load<CacheLoadModifier(MODIFIER)>(
-        reinterpret_cast<DeviceWord*>(const_cast<T*>(ptr)),
-        words);
-
-    return *reinterpret_cast<T*>(words);
-}
-
-
-/**
- * ThreadLoad definition for generic modifiers
- */
-template <
-    CacheLoadModifier MODIFIER,
-    typename InputIteratorT>
-__device__ __forceinline__ typename std::iterator_traits<InputIteratorT>::value_type ThreadLoad(InputIteratorT itr)
-{
-    // Apply tags for partial-specialization
-    return ThreadLoad(
-        itr,
-        Int2Type<MODIFIER>(),
-        Int2Type<IsPointer<InputIteratorT>::VALUE>());
-}
-
-
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/** @} */       // end group UtilIo
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/thread/thread_operators.cuh b/thrust/system/cuda/detail/cub/thread/thread_operators.cuh
deleted file mode 100644
index 5bfa790e2..000000000
--- a/thrust/system/cuda/detail/cub/thread/thread_operators.cuh
+++ /dev/null
@@ -1,317 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Simple binary operator functor types
- */
-
-/******************************************************************************
- * Simple functor operators
- ******************************************************************************/
-
-#pragma once
-
-#include "../util_macro.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilModule
- * @{
- */
-
-/**
- * \brief Default equality functor
- */
-struct Equality
-{
-    /// Boolean equality operator, returns <tt>(a == b)</tt>
-    template <typename T>
-    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
-    {
-        return a == b;
-    }
-};
-
-
-/**
- * \brief Default inequality functor
- */
-struct Inequality
-{
-    /// Boolean inequality operator, returns <tt>(a != b)</tt>
-    template <typename T>
-    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
-    {
-        return a != b;
-    }
-};
-
-
-/**
- * \brief Inequality functor (wraps equality functor)
- */
-template <typename EqualityOp>
-struct InequalityWrapper
-{
-    /// Wrapped equality operator
-    EqualityOp op;
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    InequalityWrapper(EqualityOp op) : op(op) {}
-
-    /// Boolean inequality operator, returns <tt>(a != b)</tt>
-    template <typename T>
-    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b)
-    {
-        return !op(a, b);
-    }
-};
-
-
-/**
- * \brief Default sum functor
- */
-struct Sum
-{
-    /// Boolean sum operator, returns <tt>a + b</tt>
-    template <typename T>
-    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
-    {
-        return a + b;
-    }
-};
-
-
-/**
- * \brief Default max functor
- */
-struct Max
-{
-    /// Boolean max operator, returns <tt>(a > b) ? a : b</tt>
-    template <typename T>
-    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
-    {
-        return CUB_MAX(a, b);
-    }
-};
-
-
-/**
- * \brief Arg max functor (keeps the value and offset of the first occurrence of the larger item)
- */
-struct ArgMax
-{
-    /// Boolean max operator, preferring the item having the smaller offset in case of ties
-    template <typename T, typename OffsetT>
-    __host__ __device__ __forceinline__ KeyValuePair<OffsetT, T> operator()(
-        const KeyValuePair<OffsetT, T> &a,
-        const KeyValuePair<OffsetT, T> &b) const
-    {
-// Mooch BUG (device reduce argmax gk110 3.2 million random fp32)
-//        return ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a;
-
-        if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key)))
-            return b;
-        return a;
-    }
-};
-
-
-/**
- * \brief Default min functor
- */
-struct Min
-{
-    /// Boolean min operator, returns <tt>(a < b) ? a : b</tt>
-    template <typename T>
-    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
-    {
-        return CUB_MIN(a, b);
-    }
-};
-
-
-/**
- * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
- */
-struct ArgMin
-{
-    /// Boolean min operator, preferring the item having the smaller offset in case of ties
-    template <typename T, typename OffsetT>
-    __host__ __device__ __forceinline__ KeyValuePair<OffsetT, T> operator()(
-        const KeyValuePair<OffsetT, T> &a,
-        const KeyValuePair<OffsetT, T> &b) const
-    {
-// Mooch BUG (device reduce argmax gk110 3.2 million random fp32)
-//        return ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a;
-
-        if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key)))
-            return b;
-        return a;
-    }
-};
-
-
-/**
- * \brief Default cast functor
- */
-template <typename B>
-struct CastOp
-{
-    /// Cast operator, returns <tt>(B) a</tt>
-    template <typename A>
-    __host__ __device__ __forceinline__ B operator()(const A &a) const
-    {
-        return (B) a;
-    }
-};
-
-
-/**
- * \brief Binary operator wrapper for switching non-commutative scan arguments
- */
-template <typename ScanOp>
-class SwizzleScanOp
-{
-private:
-
-    /// Wrapped scan operator
-    ScanOp scan_op;
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {}
-
-    /// Switch the scan arguments
-    template <typename T>
-    __host__ __device__ __forceinline__
-    T operator()(const T &a, const T &b)
-    {
-      T _a(a);
-      T _b(b);
-
-      return scan_op(_b, _a);
-    }
-};
-
-
-/**
- * \brief Reduce-by-segment functor.
- *
- * Given two cub::KeyValuePair inputs \p a and \p b and a
- * binary associative combining operator \p <tt>f(const T &x, const T &y)</tt>,
- * an instance of this functor returns a cub::KeyValuePair whose \p key
- * field is <tt>a.key</tt> + <tt>b.key</tt>, and whose \p value field
- * is either b.value if b.key is non-zero, or f(a.value, b.value) otherwise.
- *
- * ReduceBySegmentOp is an associative, non-commutative binary combining operator
- * for input sequences of cub::KeyValuePair pairings.  Such
- * sequences are typically used to represent a segmented set of values to be reduced
- * and a corresponding set of {0,1}-valued integer "head flags" demarcating the
- * first value of each segment.
- *
- */
-template <typename ReductionOpT>    ///< Binary reduction operator to apply to values
-struct ReduceBySegmentOp
-{
-    /// Wrapped reduction operator
-    ReductionOpT op;
-
-    /// Constructor
-    __host__ __device__ __forceinline__ ReduceBySegmentOp() {}
-
-    /// Constructor
-    __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOpT op) : op(op) {}
-
-    /// Scan operator
-    template <typename KeyValuePairT>       ///< KeyValuePair pairing of T (value) and OffsetT (head flag)
-    __host__ __device__ __forceinline__ KeyValuePairT operator()(
-        const KeyValuePairT &first,         ///< First partial reduction
-        const KeyValuePairT &second)        ///< Second partial reduction
-    {
-        KeyValuePairT retval;
-        retval.key = first.key + second.key;
-        retval.value = (second.key) ?
-                second.value :                          // The second partial reduction spans a segment reset, so it's value aggregate becomes the running aggregate
-                op(first.value, second.value);          // The second partial reduction does not span a reset, so accumulate both into the running aggregate
-        return retval;
-    }
-};
-
-
-
-template <typename ReductionOpT>    ///< Binary reduction operator to apply to values
-struct ReduceByKeyOp
-{
-    /// Wrapped reduction operator
-    ReductionOpT op;
-
-    /// Constructor
-    __host__ __device__ __forceinline__ ReduceByKeyOp() {}
-
-    /// Constructor
-    __host__ __device__ __forceinline__ ReduceByKeyOp(ReductionOpT op) : op(op) {}
-
-    /// Scan operator
-    template <typename KeyValuePairT>
-    __host__ __device__ __forceinline__ KeyValuePairT operator()(
-        const KeyValuePairT &first,       ///< First partial reduction
-        const KeyValuePairT &second)      ///< Second partial reduction
-    {
-        KeyValuePairT retval = second;
-
-        if (first.key == second.key)
-            retval.value = op(first.value, retval.value);
-
-        return retval;
-    }
-};
-
-
-
-
-
-
-
-/** @} */       // end group UtilModule
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh b/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh
deleted file mode 100644
index 7e525ea0c..000000000
--- a/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh
+++ /dev/null
@@ -1,152 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Thread utilities for sequential reduction over statically-sized array types
- */
-
-#pragma once
-
-#include "../thread/thread_operators.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations)
-namespace internal {
-
-/**
- * Sequential reduction over statically-sized array types
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ReductionOp>
-__device__ __forceinline__ T ThreadReduce(
-    T*                  input,                  ///< [in] Input array
-    ReductionOp         reduction_op,           ///< [in] Binary reduction operator
-    T                   prefix,                 ///< [in] Prefix to seed reduction with
-    Int2Type<LENGTH>    /*length*/)
-{
-    T retval = prefix;
-
-    #pragma unroll
-    for (int i = 0; i < LENGTH; ++i)
-        retval = reduction_op(retval, input[i]);
-
-    return retval;
-}
-
-
-/**
- * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix.  The aggregate is returned.
- *
- * \tparam LENGTH     LengthT of input array
- * \tparam T          <b>[inferred]</b> The data type to be reduced.
- * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ReductionOp>
-__device__ __forceinline__ T ThreadReduce(
-    T*          input,                  ///< [in] Input array
-    ReductionOp reduction_op,           ///< [in] Binary reduction operator
-    T           prefix)                 ///< [in] Prefix to seed reduction with
-{
-    return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
-}
-
-
-/**
- * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array.  The aggregate is returned.
- *
- * \tparam LENGTH     LengthT of input array
- * \tparam T          <b>[inferred]</b> The data type to be reduced.
- * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ReductionOp>
-__device__ __forceinline__ T ThreadReduce(
-    T*          input,                  ///< [in] Input array
-    ReductionOp reduction_op)           ///< [in] Binary reduction operator
-{
-    T prefix = input[0];
-    return ThreadReduce<LENGTH - 1>(input + 1, reduction_op, prefix);
-}
-
-
-/**
- * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix.  The aggregate is returned.
- *
- * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input array
- * \tparam T          <b>[inferred]</b> The data type to be reduced.
- * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ReductionOp>
-__device__ __forceinline__ T ThreadReduce(
-    T           (&input)[LENGTH],       ///< [in] Input array
-    ReductionOp reduction_op,           ///< [in] Binary reduction operator
-    T           prefix)                 ///< [in] Prefix to seed reduction with
-{
-    return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
-}
-
-
-/**
- * \brief Serial reduction with the specified operator
- *
- * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input array
- * \tparam T          <b>[inferred]</b> The data type to be reduced.
- * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ReductionOp>
-__device__ __forceinline__ T ThreadReduce(
-    T           (&input)[LENGTH],       ///< [in] Input array
-    ReductionOp reduction_op)           ///< [in] Binary reduction operator
-{
-    return ThreadReduce<LENGTH>((T*) input, reduction_op);
-}
-
-
-}               // internal namespace
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/thread/thread_scan.cuh b/thrust/system/cuda/detail/cub/thread/thread_scan.cuh
deleted file mode 100644
index 94f3016f4..000000000
--- a/thrust/system/cuda/detail/cub/thread/thread_scan.cuh
+++ /dev/null
@@ -1,268 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Thread utilities for sequential prefix scan over statically-sized array types
- */
-
-#pragma once
-
-#include "../thread/thread_operators.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations)
-namespace internal {
-
-
-/**
- * \addtogroup UtilModule
- * @{
- */
-
-/**
- * \name Sequential prefix scan over statically-sized array types
- * @{
- */
-
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanExclusive(
-    T                   inclusive,
-    T                   exclusive,
-    T                   *input,                 ///< [in] Input array
-    T                   *output,                ///< [out] Output array (may be aliased to \p input)
-    ScanOp              scan_op,                ///< [in] Binary scan operator
-    Int2Type<LENGTH>    /*length*/)
-{
-    #pragma unroll
-    for (int i = 0; i < LENGTH; ++i)
-    {
-        inclusive = scan_op(exclusive, input[i]);
-        output[i] = exclusive;
-        exclusive = inclusive;
-    }
-
-    return inclusive;
-}
-
-
-
-/**
- * \brief Perform a sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix.  The aggregate is returned.
- *
- * \tparam LENGTH     LengthT of \p input and \p output arrays
- * \tparam T          <b>[inferred]</b> The data type to be scanned.
- * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanExclusive(
-    T           *input,                 ///< [in] Input array
-    T           *output,                ///< [out] Output array (may be aliased to \p input)
-    ScanOp      scan_op,                ///< [in] Binary scan operator
-    T           prefix,                 ///< [in] Prefix to seed scan with
-    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  If not, the first output element is undefined.  (Handy for preventing thread-0 from applying a prefix.)
-{
-    T inclusive = input[0];
-    if (apply_prefix)
-    {
-        inclusive = scan_op(prefix, inclusive);
-    }
-    output[0] = prefix;
-    T exclusive = inclusive;
-
-    return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
-}
-
-
-/**
- * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix.  The aggregate is returned.
- *
- * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input and \p output arrays
- * \tparam T          <b>[inferred]</b> The data type to be scanned.
- * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanExclusive(
-    T           (&input)[LENGTH],       ///< [in] Input array
-    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
-    ScanOp      scan_op,                ///< [in] Binary scan operator
-    T           prefix,                 ///< [in] Prefix to seed scan with
-    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
-{
-    return ThreadScanExclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix, apply_prefix);
-}
-
-
-
-
-
-
-
-
-
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanInclusive(
-    T                   inclusive,
-    T                   *input,                 ///< [in] Input array
-    T                   *output,                ///< [out] Output array (may be aliased to \p input)
-    ScanOp              scan_op,                ///< [in] Binary scan operator
-    Int2Type<LENGTH>    /*length*/)
-{
-    #pragma unroll
-    for (int i = 0; i < LENGTH; ++i)
-    {
-        inclusive = scan_op(inclusive, input[i]);
-        output[i] = inclusive;
-    }
-
-    return inclusive;
-}
-
-
-/**
- * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array.  The aggregate is returned.
- *
- * \tparam LENGTH     LengthT of \p input and \p output arrays
- * \tparam T          <b>[inferred]</b> The data type to be scanned.
- * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanInclusive(
-    T           *input,                 ///< [in] Input array
-    T           *output,                ///< [out] Output array (may be aliased to \p input)
-    ScanOp      scan_op)                ///< [in] Binary scan operator
-{
-    T inclusive = input[0];
-    output[0] = inclusive;
-
-    // Continue scan
-    return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
-}
-
-
-/**
- * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array.  The aggregate is returned.
- *
- * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input and \p output arrays
- * \tparam T          <b>[inferred]</b> The data type to be scanned.
- * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanInclusive(
-    T           (&input)[LENGTH],       ///< [in] Input array
-    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
-    ScanOp      scan_op)                ///< [in] Binary scan operator
-{
-    return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op);
-}
-
-
-/**
- * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix.  The aggregate is returned.
- *
- * \tparam LENGTH     LengthT of \p input and \p output arrays
- * \tparam T          <b>[inferred]</b> The data type to be scanned.
- * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanInclusive(
-    T           *input,                 ///< [in] Input array
-    T           *output,                ///< [out] Output array (may be aliased to \p input)
-    ScanOp      scan_op,                ///< [in] Binary scan operator
-    T           prefix,                 ///< [in] Prefix to seed scan with
-    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
-{
-    T inclusive = input[0];
-    if (apply_prefix)
-    {
-        inclusive = scan_op(prefix, inclusive);
-    }
-    output[0] = inclusive;
-
-    // Continue scan
-    return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
-}
-
-
-/**
- * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix.  The aggregate is returned.
- *
- * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input and \p output arrays
- * \tparam T          <b>[inferred]</b> The data type to be scanned.
- * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanInclusive(
-    T           (&input)[LENGTH],       ///< [in] Input array
-    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
-    ScanOp      scan_op,                ///< [in] Binary scan operator
-    T           prefix,                 ///< [in] Prefix to seed scan with
-    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
-{
-    return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix, apply_prefix);
-}
-
-
-//@}  end member group
-
-/** @} */       // end group UtilModule
-
-
-}               // internal namespace
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/thread/thread_search.cuh b/thrust/system/cuda/detail/cub/thread/thread_search.cuh
deleted file mode 100644
index 3fcdd628f..000000000
--- a/thrust/system/cuda/detail/cub/thread/thread_search.cuh
+++ /dev/null
@@ -1,154 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Thread utilities for sequential search
- */
-
-#pragma once
-
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * Computes the begin offsets into A and B for the specific diagonal
- */
-template <
-    typename AIteratorT,
-    typename BIteratorT,
-    typename OffsetT,
-    typename CoordinateT>
-__host__ __device__ __forceinline__ void MergePathSearch(
-    OffsetT         diagonal,
-    AIteratorT      a,
-    BIteratorT      b,
-    OffsetT         a_len,
-    OffsetT         b_len,
-    CoordinateT&    path_coordinate)
-{
-    /// The value type of the input iterator
-    typedef typename std::iterator_traits<AIteratorT>::value_type T;
-
-    OffsetT split_min = CUB_MAX(diagonal - b_len, 0);
-    OffsetT split_max = CUB_MIN(diagonal, a_len);
-
-    while (split_min < split_max)
-    {
-        OffsetT split_pivot = (split_min + split_max) >> 1;
-        if (a[split_pivot] <= b[diagonal - split_pivot - 1])
-        {
-            // Move candidate split range up A, down B
-            split_min = split_pivot + 1;
-        }
-        else
-        {
-            // Move candidate split range up B, down A
-            split_max = split_pivot;
-        }
-    }
-
-    path_coordinate.x = CUB_MIN(split_min, a_len);
-    path_coordinate.y = diagonal - split_min;
-}
-
-
-
-/**
- * \brief Returns the offset of the first value within \p input which does not compare less than \p val
- */
-template <
-    typename InputIteratorT,
-    typename OffsetT,
-    typename T>
-__device__ __forceinline__ OffsetT LowerBound(
-    InputIteratorT      input,              ///< [in] Input sequence
-    OffsetT             num_items,          ///< [in] Input sequence length
-    T                   val)                ///< [in] Search key
-{
-    OffsetT retval = 0;
-    while (num_items > 0)
-    {
-        OffsetT half = num_items >> 1;
-        if (input[retval + half] < val)
-        {
-            retval = retval + (half + 1);
-            num_items = num_items - (half + 1);
-        }
-        else
-        {
-            num_items = half;
-        }
-    }
-
-    return retval;
-}
-
-
-/**
- * \brief Returns the offset of the first value within \p input which compares greater than \p val
- */
-template <
-    typename InputIteratorT,
-    typename OffsetT,
-    typename T>
-__device__ __forceinline__ OffsetT UpperBound(
-    InputIteratorT      input,              ///< [in] Input sequence
-    OffsetT             num_items,          ///< [in] Input sequence length
-    T                   val)                ///< [in] Search key
-{
-    OffsetT retval = 0;
-    while (num_items > 0)
-    {
-        OffsetT half = num_items >> 1;
-        if (val < input[retval + half])
-        {
-            num_items = half;
-        }
-        else
-        {
-            retval = retval + (half + 1);
-            num_items = num_items - (half + 1);
-        }
-    }
-
-    return retval;
-}
-
-
-
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/thread/thread_store.cuh b/thrust/system/cuda/detail/cub/thread/thread_store.cuh
deleted file mode 100644
index ca4fbd2f4..000000000
--- a/thrust/system/cuda/detail/cub/thread/thread_store.cuh
+++ /dev/null
@@ -1,422 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Thread utilities for writing memory using PTX cache modifiers.
- */
-
-#pragma once
-
-//#include <cuda.h>
-
-#include "../util_ptx.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIo
- * @{
- */
-
-
-//-----------------------------------------------------------------------------
-// Tags and constants
-//-----------------------------------------------------------------------------
-
-/**
- * \brief Enumeration of cache modifiers for memory store operations.
- */
-enum CacheStoreModifier
-{
-    STORE_DEFAULT,              ///< Default (no modifier)
-    STORE_WB,                   ///< Cache write-back all coherent levels
-    STORE_CG,                   ///< Cache at global level
-    STORE_CS,                   ///< Cache streaming (likely to be accessed once)
-    STORE_WT,                   ///< Cache write-through (to system memory)
-    STORE_VOLATILE,             ///< Volatile shared (any memory space)
-};
-
-
-/**
- * \name Thread I/O (cache modified)
- * @{
- */
-
-/**
- * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers.  Can be used to store any data type.
- *
- * \par Example
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_store.cuh>
- *
- * // 32-bit store using cache-global modifier:
- * int *d_out;
- * int val;
- * cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val);
- *
- * // 16-bit store using default modifier
- * short *d_out;
- * short val;
- * cub::ThreadStore<cub::STORE_DEFAULT>(d_out + threadIdx.x, val);
- *
- * // 256-bit store using write-through modifier
- * double4 *d_out;
- * double4 val;
- * cub::ThreadStore<cub::STORE_WT>(d_out + threadIdx.x, val);
- *
- * // 96-bit store using cache-streaming cache modifier
- * struct TestFoo { bool a; short b; };
- * TestFoo *d_struct;
- * TestFoo val;
- * cub::ThreadStore<cub::STORE_CS>(d_out + threadIdx.x, val);
- * \endcode
- *
- * \tparam MODIFIER             <b>[inferred]</b> CacheStoreModifier enumeration
- * \tparam InputIteratorT       <b>[inferred]</b> Output iterator type \iterator
- * \tparam T                    <b>[inferred]</b> Data type of output value
- */
-template <
-    CacheStoreModifier  MODIFIER,
-    typename            OutputIteratorT,
-    typename            T>
-__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val);
-
-
-//@}  end member group
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-/// Helper structure for templated store iteration (inductive case)
-template <int COUNT, int MAX>
-struct IterateThreadStore
-{
-    template <CacheStoreModifier MODIFIER, typename T>
-    static __device__ __forceinline__ void Store(T *ptr, T *vals)
-    {
-        ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);
-        IterateThreadStore<COUNT + 1, MAX>::template Store<MODIFIER>(ptr, vals);
-    }
-
-    template <typename OutputIteratorT, typename T>
-    static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals)
-    {
-        ptr[COUNT] = vals[COUNT];
-        IterateThreadStore<COUNT + 1, MAX>::Dereference(ptr, vals);
-    }
-
-};
-
-/// Helper structure for templated store iteration (termination case)
-template <int MAX>
-struct IterateThreadStore<MAX, MAX>
-{
-    template <CacheStoreModifier MODIFIER, typename T>
-    static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/) {}
-
-    template <typename OutputIteratorT, typename T>
-    static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/) {}
-};
-
-
-/**
- * Define a uint4 (16B) ThreadStore specialization for the given Cache load modifier
- */
-#define _CUB_STORE_16(cub_modifier, ptx_modifier)                                            \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val)                         \
-    {                                                                                       \
-        asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : :               \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "r"(val.x),                                                                     \
-            "r"(val.y),                                                                     \
-            "r"(val.z),                                                                     \
-            "r"(val.w));                                                                    \
-    }                                                                                       \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val)     \
-    {                                                                                       \
-        asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : :                       \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "l"(val.x),                                                                     \
-            "l"(val.y));                                                                    \
-    }
-
-
-/**
- * Define a uint2 (8B) ThreadStore specialization for the given Cache load modifier
- */
-#define _CUB_STORE_8(cub_modifier, ptx_modifier)                                             \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val)                 \
-    {                                                                                       \
-        asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : :               \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "h"(val.x),                                                                     \
-            "h"(val.y),                                                                     \
-            "h"(val.z),                                                                     \
-            "h"(val.w));                                                                    \
-    }                                                                                       \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val)                         \
-    {                                                                                       \
-        asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : :                       \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "r"(val.x),                                                                     \
-            "r"(val.y));                                                                    \
-    }                                                                                       \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val)     \
-    {                                                                                       \
-        asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : :                                \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "l"(val));                                                                      \
-    }
-
-/**
- * Define a unsigned int (4B) ThreadStore specialization for the given Cache load modifier
- */
-#define _CUB_STORE_4(cub_modifier, ptx_modifier)                                             \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val)                             \
-    {                                                                                       \
-        asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : :                                \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "r"(val));                                                                      \
-    }
-
-
-/**
- * Define a unsigned short (2B) ThreadStore specialization for the given Cache load modifier
- */
-#define _CUB_STORE_2(cub_modifier, ptx_modifier)                                             \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val)                     \
-    {                                                                                       \
-        asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : :                                \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "h"(val));                                                                      \
-    }
-
-
-/**
- * Define a unsigned char (1B) ThreadStore specialization for the given Cache load modifier
- */
-#define _CUB_STORE_1(cub_modifier, ptx_modifier)                                             \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val)                         \
-    {                                                                                       \
-        asm volatile (                                                                      \
-        "{"                                                                                 \
-        "   .reg .u8 datum;"                                                                \
-        "   cvt.u8.u16 datum, %1;"                                                          \
-        "   st."#ptx_modifier".u8 [%0], datum;"                                             \
-        "}" : :                                                                             \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "h"((unsigned short) val));                                                               \
-    }
-
-/**
- * Define powers-of-two ThreadStore specializations for the given Cache load modifier
- */
-#define _CUB_STORE_ALL(cub_modifier, ptx_modifier)                                           \
-    _CUB_STORE_16(cub_modifier, ptx_modifier)                                                \
-    _CUB_STORE_8(cub_modifier, ptx_modifier)                                                 \
-    _CUB_STORE_4(cub_modifier, ptx_modifier)                                                 \
-    _CUB_STORE_2(cub_modifier, ptx_modifier)                                                 \
-    _CUB_STORE_1(cub_modifier, ptx_modifier)                                                 \
-
-
-/**
- * Define ThreadStore specializations for the various Cache load modifiers
- */
-#if CUB_PTX_ARCH >= 200
-    _CUB_STORE_ALL(STORE_WB, wb)
-    _CUB_STORE_ALL(STORE_CG, cg)
-    _CUB_STORE_ALL(STORE_CS, cs)
-    _CUB_STORE_ALL(STORE_WT, wt)
-#else
-    _CUB_STORE_ALL(STORE_WB, global)
-    _CUB_STORE_ALL(STORE_CG, global)
-    _CUB_STORE_ALL(STORE_CS, global)
-    _CUB_STORE_ALL(STORE_WT, volatile.global)
-#endif
-
-
-// Macro cleanup
-#undef _CUB_STORE_ALL
-#undef _CUB_STORE_1
-#undef _CUB_STORE_2
-#undef _CUB_STORE_4
-#undef _CUB_STORE_8
-#undef _CUB_STORE_16
-
-
-/**
- * ThreadStore definition for STORE_DEFAULT modifier on iterator types
- */
-template <typename OutputIteratorT, typename T>
-__device__ __forceinline__ void ThreadStore(
-    OutputIteratorT             itr,
-    T                           val,
-    Int2Type<STORE_DEFAULT>     /*modifier*/,
-    Int2Type<false>             /*is_pointer*/)
-{
-    *itr = val;
-}
-
-
-/**
- * ThreadStore definition for STORE_DEFAULT modifier on pointer types
- */
-template <typename T>
-__device__ __forceinline__ void ThreadStore(
-    T                           *ptr,
-    T                           val,
-    Int2Type<STORE_DEFAULT>     /*modifier*/,
-    Int2Type<true>              /*is_pointer*/)
-{
-    *ptr = val;
-}
-
-
-/**
- * ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types
- */
-template <typename T>
-__device__ __forceinline__ void ThreadStoreVolatilePtr(
-    T                           *ptr,
-    T                           val,
-    Int2Type<true>              /*is_primitive*/)
-{
-    *reinterpret_cast<volatile T*>(ptr) = val;
-}
-
-
-/**
- * ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types
- */
-template <typename T>
-__device__ __forceinline__ void ThreadStoreVolatilePtr(
-    T                           *ptr,
-    T                           val,
-    Int2Type<false>             /*is_primitive*/)
-{
-    // Create a temporary using shuffle-words, then store using volatile-words
-    typedef typename UnitWord<T>::VolatileWord  VolatileWord;  
-    typedef typename UnitWord<T>::ShuffleWord   ShuffleWord;
-
-    const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
-    const int SHUFFLE_MULTIPLE  = sizeof(T) / sizeof(ShuffleWord);
-    
-    VolatileWord words[VOLATILE_MULTIPLE];
-
-    #pragma unroll
-    for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
-        reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];
-
-    IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference(
-        reinterpret_cast<volatile VolatileWord*>(ptr),
-        words);
-}
-
-
-/**
- * ThreadStore definition for STORE_VOLATILE modifier on pointer types
- */
-template <typename T>
-__device__ __forceinline__ void ThreadStore(
-    T                           *ptr,
-    T                           val,
-    Int2Type<STORE_VOLATILE>    /*modifier*/,
-    Int2Type<true>              /*is_pointer*/)
-{
-    ThreadStoreVolatilePtr(ptr, val, Int2Type<Traits<T>::PRIMITIVE>());
-}
-
-
-/**
- * ThreadStore definition for generic modifiers on pointer types
- */
-template <typename T, int MODIFIER>
-__device__ __forceinline__ void ThreadStore(
-    T                           *ptr,
-    T                           val,
-    Int2Type<MODIFIER>          /*modifier*/,
-    Int2Type<true>              /*is_pointer*/)
-{
-    // Create a temporary using shuffle-words, then store using device-words
-    typedef typename UnitWord<T>::DeviceWord    DeviceWord;  
-    typedef typename UnitWord<T>::ShuffleWord   ShuffleWord;
-
-    const int DEVICE_MULTIPLE   = sizeof(T) / sizeof(DeviceWord);
-    const int SHUFFLE_MULTIPLE  = sizeof(T) / sizeof(ShuffleWord);
-    
-    DeviceWord words[DEVICE_MULTIPLE];
-
-    #pragma unroll
-    for (int i = 0; i < SHUFFLE_MULTIPLE; ++i)
-        reinterpret_cast<ShuffleWord*>(words)[i] = reinterpret_cast<ShuffleWord*>(&val)[i];
-
-    IterateThreadStore<0, DEVICE_MULTIPLE>::template Store<CacheStoreModifier(MODIFIER)>(
-        reinterpret_cast<DeviceWord*>(ptr),
-        words);
-}
-
-
-/**
- * ThreadStore definition for generic modifiers
- */
-template <CacheStoreModifier MODIFIER, typename OutputIteratorT, typename T>
-__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val)
-{
-    ThreadStore(
-        itr,
-        val,
-        Int2Type<MODIFIER>(),
-        Int2Type<IsPointer<OutputIteratorT>::VALUE>());
-}
-
-
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/** @} */       // end group UtilIo
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/util_allocator.cuh b/thrust/system/cuda/detail/cub/util_allocator.cuh
deleted file mode 100644
index 3ed80d3c5..000000000
--- a/thrust/system/cuda/detail/cub/util_allocator.cuh
+++ /dev/null
@@ -1,708 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Simple caching allocator for device memory allocations. The allocator is
- * thread-safe and capable of managing device allocations on multiple devices.
- ******************************************************************************/
-
-#pragma once
-
-#include "util_namespace.cuh"
-#include "util_debug.cuh"
-
-#include <set>
-#include <map>
-
-#include "host/mutex.cuh"
-#include <math.h>
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilMgmt
- * @{
- */
-
-
-/******************************************************************************
- * CachingDeviceAllocator (host use)
- ******************************************************************************/
-
-/**
- * \brief A simple caching allocator for device memory allocations.
- *
- * \par Overview
- * The allocator is thread-safe and stream-safe and is capable of managing cached
- * device allocations on multiple devices.  It behaves as follows:
- *
- * \par
- * - Allocations from the allocator are associated with an \p active_stream.  Once freed,
- *   the allocation becomes available immediately for reuse within the \p active_stream
- *   with which it was associated with during allocation, and it becomes available for
- *   reuse within other streams when all prior work submitted to \p active_stream has completed.
- * - Allocations are categorized and cached by bin size.  A new allocation request of
- *   a given size will only consider cached allocations within the corresponding bin.
- * - Bin limits progress geometrically in accordance with the growth factor
- *   \p bin_growth provided during construction.  Unused device allocations within
- *   a larger bin cache are not reused for allocation requests that categorize to
- *   smaller bin sizes.
- * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
- *   (\p bin_growth ^ \p min_bin).
- * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
- *   bin and are simply freed when they are deallocated instead of being returned
- *   to a bin-cache.
- * - %If the total storage of cached allocations on a given device will exceed
- *   \p max_cached_bytes, allocations for that device are simply freed when they are
- *   deallocated instead of being returned to their bin-cache.
- *
- * \par
- * For example, the default-constructed CachingDeviceAllocator is configured with:
- * - \p bin_growth          = 8
- * - \p min_bin             = 3
- * - \p max_bin             = 7
- * - \p max_cached_bytes    = 6MB - 1B
- *
- * \par
- * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB
- * and sets a maximum of 6,291,455 cached bytes per device
- *
- */
-struct CachingDeviceAllocator
-{
-
-    //---------------------------------------------------------------------
-    // Constants
-    //---------------------------------------------------------------------
-
-    /// Out-of-bounds bin
-    static const unsigned int INVALID_BIN = (unsigned int) -1;
-
-    /// Invalid size
-    static const size_t INVALID_SIZE = (size_t) -1;
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-    /// Invalid device ordinal
-    static const int INVALID_DEVICE_ORDINAL = -1;
-
-    //---------------------------------------------------------------------
-    // Type definitions and helper types
-    //---------------------------------------------------------------------
-
-    /**
-     * Descriptor for device memory allocations
-     */
-    struct BlockDescriptor
-    {
-        void*           d_ptr;              // Device pointer
-        size_t          bytes;              // Size of allocation in bytes
-        unsigned int    bin;                // Bin enumeration
-        int             device;             // device ordinal
-        cudaStream_t    associated_stream;  // Associated associated_stream
-        cudaEvent_t     ready_event;        // Signal when associated stream has run to the point at which this block was freed
-
-        // Constructor (suitable for searching maps for a specific block, given its pointer and device)
-        BlockDescriptor(void *d_ptr, int device) :
-            d_ptr(d_ptr),
-            bytes(0),
-            bin(INVALID_BIN),
-            device(device),
-            associated_stream(0),
-            ready_event(0)
-        {}
-
-        // Constructor (suitable for searching maps for a range of suitable blocks, given a device)
-        BlockDescriptor(int device) :
-            d_ptr(NULL),
-            bytes(0),
-            bin(INVALID_BIN),
-            device(device),
-            associated_stream(0),
-            ready_event(0)
-        {}
-
-        // Comparison functor for comparing device pointers
-        static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
-        {
-            if (a.device == b.device)
-                return (a.d_ptr < b.d_ptr);
-            else
-                return (a.device < b.device);
-        }
-
-        // Comparison functor for comparing allocation sizes
-        static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
-        {
-            if (a.device == b.device)
-                return (a.bytes < b.bytes);
-            else
-                return (a.device < b.device);
-        }
-    };
-
-    /// BlockDescriptor comparator function interface
-    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
-
-    class TotalBytes {
-    public:
-        size_t free;
-        size_t live;
-        TotalBytes() { free = live = 0; }
-    };
-
-    /// Set type for cached blocks (ordered by size)
-    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
-
-    /// Set type for live blocks (ordered by ptr)
-    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
-
-    /// Map type of device ordinals to the number of cached bytes cached by each device
-    typedef std::map<int, TotalBytes> GpuCachedBytes;
-
-
-    //---------------------------------------------------------------------
-    // Utility functions
-    //---------------------------------------------------------------------
-
-    /**
-     * Integer pow function for unsigned base and exponent
-     */
-    static unsigned int IntPow(
-        unsigned int base,
-        unsigned int exp)
-    {
-        unsigned int retval = 1;
-        while (exp > 0)
-        {
-            if (exp & 1) {
-                retval = retval * base;        // multiply the result by the current base
-            }
-            base = base * base;                // square the base
-            exp = exp >> 1;                    // divide the exponent in half
-        }
-        return retval;
-    }
-
-
-    /**
-     * Round up to the nearest power-of
-     */
-    void NearestPowerOf(
-        unsigned int    &power,
-        size_t          &rounded_bytes,
-        unsigned int    base,
-        size_t          value)
-    {
-        power = 0;
-        rounded_bytes = 1;
-
-        if (value * base < value)
-        {
-            // Overflow
-            power = sizeof(size_t) * 8;
-            rounded_bytes = size_t(0) - 1;
-            return;
-        }
-
-        while (rounded_bytes < value)
-        {
-            rounded_bytes *= base;
-            power++;
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Fields
-    //---------------------------------------------------------------------
-
-    cub::Mutex      mutex;              /// Mutex for thread-safety
-
-    unsigned int    bin_growth;         /// Geometric growth factor for bin-sizes
-    unsigned int    min_bin;            /// Minimum bin enumeration
-    unsigned int    max_bin;            /// Maximum bin enumeration
-
-    size_t          min_bin_bytes;      /// Minimum bin size
-    size_t          max_bin_bytes;      /// Maximum bin size
-    size_t          max_cached_bytes;   /// Maximum aggregate cached bytes per device
-
-    const bool      skip_cleanup;       /// Whether or not to skip a call to FreeAllCached() when destructor is called.  (The CUDA runtime may have already shut down for statically declared allocators)
-    bool            debug;              /// Whether or not to print (de)allocation events to stdout
-
-    GpuCachedBytes  cached_bytes;       /// Map of device ordinal to aggregate cached bytes on that device
-    CachedBlocks    cached_blocks;      /// Set of cached device allocations available for reuse
-    BusyBlocks      live_blocks;        /// Set of live device allocations currently in use
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-    //---------------------------------------------------------------------
-    // Methods
-    //---------------------------------------------------------------------
-
-    /**
-     * \brief Constructor.
-     */
-    CachingDeviceAllocator(
-        unsigned int    bin_growth,                             ///< Geometric growth factor for bin-sizes
-        unsigned int    min_bin             = 1,                ///< Minimum bin (default is bin_growth ^ 1)
-        unsigned int    max_bin             = INVALID_BIN,      ///< Maximum bin (default is no max bin)
-        size_t          max_cached_bytes    = INVALID_SIZE,     ///< Maximum aggregate cached bytes per device (default is no limit)
-        bool            skip_cleanup        = false,            ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate)
-        bool            debug               = false)            ///< Whether or not to print (de)allocation events to stdout (default is no stderr output)
-    :
-        bin_growth(bin_growth),
-        min_bin(min_bin),
-        max_bin(max_bin),
-        min_bin_bytes(IntPow(bin_growth, min_bin)),
-        max_bin_bytes(IntPow(bin_growth, max_bin)),
-        max_cached_bytes(max_cached_bytes),
-        skip_cleanup(skip_cleanup),
-        debug(debug),
-        cached_blocks(BlockDescriptor::SizeCompare),
-        live_blocks(BlockDescriptor::PtrCompare)
-    {}
-
-
-    /**
-     * \brief Default constructor.
-     *
-     * Configured with:
-     * \par
-     * - \p bin_growth          = 8
-     * - \p min_bin             = 3
-     * - \p max_bin             = 7
-     * - \p max_cached_bytes    = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
-     *
-     * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and
-     * sets a maximum of 6,291,455 cached bytes per device
-     */
-    CachingDeviceAllocator(
-        bool skip_cleanup = false,
-        bool debug = false)
-    :
-        bin_growth(8),
-        min_bin(3),
-        max_bin(7),
-        min_bin_bytes(IntPow(bin_growth, min_bin)),
-        max_bin_bytes(IntPow(bin_growth, max_bin)),
-        max_cached_bytes((max_bin_bytes * 3) - 1),
-        skip_cleanup(skip_cleanup),
-        debug(debug),
-        cached_blocks(BlockDescriptor::SizeCompare),
-        live_blocks(BlockDescriptor::PtrCompare)
-    {}
-
-
-    /**
-     * \brief Sets the limit on the number bytes this allocator is allowed to cache per device.
-     *
-     * Changing the ceiling of cached bytes does not cause any allocations (in-use or
-     * cached-in-reserve) to be freed.  See \p FreeAllCached().
-     */
-    cudaError_t SetMaxCachedBytes(
-        size_t max_cached_bytes)
-    {
-        // Lock
-        mutex.Lock();
-
-        if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes);
-
-        this->max_cached_bytes = max_cached_bytes;
-
-        // Unlock
-        mutex.Unlock();
-
-        return cudaSuccess;
-    }
-
-
-    /**
-     * \brief Provides a suitable allocation of device memory for the given size on the specified device.
-     *
-     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
-     * with which it was associated with during allocation, and it becomes available for reuse within other
-     * streams when all prior work submitted to \p active_stream has completed.
-     */
-    cudaError_t DeviceAllocate(
-        int             device,             ///< [in] Device on which to place the allocation
-        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
-        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
-        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
-    {
-        *d_ptr                          = NULL;
-        int entrypoint_device           = INVALID_DEVICE_ORDINAL;
-        cudaError_t error               = cudaSuccess;
-
-        if (device == INVALID_DEVICE_ORDINAL)
-        {
-            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
-            device = entrypoint_device;
-        }
-
-        // Create a block descriptor for the requested allocation
-        bool found = false;
-        BlockDescriptor search_key(device);
-        search_key.associated_stream = active_stream;
-        NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes);
-
-        if (search_key.bin > max_bin)
-        {
-            // Bin is greater than our maximum bin: allocate the request
-            // exactly and give out-of-bounds bin.  It will not be cached
-            // for reuse when returned.
-            search_key.bin      = INVALID_BIN;
-            search_key.bytes    = bytes;
-        }
-        else
-        {
-            // Search for a suitable cached allocation: lock
-            mutex.Lock();
-
-            if (search_key.bin < min_bin)
-            {
-                // Bin is less than minimum bin: round up
-                search_key.bin      = min_bin;
-                search_key.bytes    = min_bin_bytes;
-            }
-
-            // Iterate through the range of cached blocks on the same device in the same bin
-            CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
-            while ((block_itr != cached_blocks.end())
-                    && (block_itr->device == device)
-                    && (block_itr->bin == search_key.bin))
-            {
-                // To prevent races with reusing blocks returned by the host but still
-                // in use by the device, only consider cached blocks that are
-                // either (from the active stream) or (from an idle stream)
-                if ((active_stream == block_itr->associated_stream) ||
-                    (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
-                {
-                    // Reuse existing cache block.  Insert into live blocks.
-                    found = true;
-                    search_key = *block_itr;
-                    search_key.associated_stream = active_stream;
-                    live_blocks.insert(search_key);
-
-                    // Remove from free blocks
-                    cached_bytes[device].free -= search_key.bytes;
-                    cached_bytes[device].live += search_key.bytes;
-
-                    if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
-                        device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long)  block_itr->associated_stream);
-
-                    cached_blocks.erase(block_itr);
-
-                    break;
-                }
-                block_itr++;
-            }
-
-            // Done searching: unlock
-            mutex.Unlock();
-        }
-
-        // Allocate the block if necessary
-        if (!found)
-        {
-            // Set runtime's current device to specified device (entrypoint may not be set)
-            if (device != entrypoint_device)
-            {
-                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
-                if (CubDebug(error = cudaSetDevice(device))) return error;
-            }
-
-            // Attempt to allocate
-            if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation)
-            {
-                // The allocation attempt failed: free all cached blocks on device and retry
-                if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations",
-                      device, (long long) search_key.bytes, (long long) search_key.associated_stream);
-
-                error = cudaSuccess;    // Reset the error we will return
-                cudaGetLastError();     // Reset CUDART's error
-
-                // Lock
-                mutex.Lock();
-
-                // Iterate the range of free blocks on the same device
-                BlockDescriptor free_key(device);
-                CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key);
-
-                while ((block_itr != cached_blocks.end()) && (block_itr->device == device))
-                {
-                    // No need to worry about synchronization with the device: cudaFree is
-                    // blocking and will synchronize across all kernels executing
-                    // on the current device
-
-                    // Free device memory and destroy stream event.
-                    if (CubDebug(error = cudaFree(block_itr->d_ptr))) break;
-                    if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break;
-
-                    // Reduce balance and erase entry
-                    cached_bytes[device].free -= block_itr->bytes;
-
-                    if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
-                        device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
-
-                    cached_blocks.erase(block_itr);
-
-                    block_itr++;
-                }
-
-                // Unlock
-                mutex.Unlock();
-
-                // Return under error
-                if (error) return error;
-
-                // Try to allocate again
-                if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error;
-            }
-
-            // Create ready event
-            if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
-                return error;
-
-            // Insert into live blocks
-            mutex.Lock();
-            live_blocks.insert(search_key);
-            cached_bytes[device].live += search_key.bytes;
-            mutex.Unlock();
-
-            if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
-                      device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream);
-
-            // Attempt to revert back to previous device if necessary
-            if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
-            {
-                if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
-            }
-        }
-
-        // Copy device pointer to output parameter
-        *d_ptr = search_key.d_ptr;
-
-        if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
-            (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
-
-        return error;
-    }
-
-
-    /**
-     * \brief Provides a suitable allocation of device memory for the given size on the current device.
-     *
-     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
-     * with which it was associated with during allocation, and it becomes available for reuse within other
-     * streams when all prior work submitted to \p active_stream has completed.
-     */
-    cudaError_t DeviceAllocate(
-        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
-        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
-        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
-    {
-        return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream);
-    }
-
-
-    /**
-     * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator.
-     *
-     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
-     * with which it was associated with during allocation, and it becomes available for reuse within other
-     * streams when all prior work submitted to \p active_stream has completed.
-     */
-    cudaError_t DeviceFree(
-        int             device,
-        void*           d_ptr)
-    {
-        int entrypoint_device           = INVALID_DEVICE_ORDINAL;
-        cudaError_t error               = cudaSuccess;
-
-        if (device == INVALID_DEVICE_ORDINAL)
-        {
-            if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
-                return error;
-            device = entrypoint_device;
-        }
-
-        // Lock
-        mutex.Lock();
-
-        // Find corresponding block descriptor
-        bool recached = false;
-        BlockDescriptor search_key(d_ptr, device);
-        BusyBlocks::iterator block_itr = live_blocks.find(search_key);
-        if (block_itr != live_blocks.end())
-        {
-            // Remove from live blocks
-            search_key = *block_itr;
-            live_blocks.erase(block_itr);
-            cached_bytes[device].live -= search_key.bytes;
-
-            // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold
-            if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes))
-            {
-                // Insert returned allocation into free blocks
-                recached = true;
-                cached_blocks.insert(search_key);
-                cached_bytes[device].free += search_key.bytes;
-
-                if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
-                    device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
-                    (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
-            }
-        }
-
-        // Unlock
-        mutex.Unlock();
-
-        // First set to specified device (entrypoint may not be set)
-        if (device != entrypoint_device)
-        {
-            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
-            if (CubDebug(error = cudaSetDevice(device))) return error;
-        }
-
-        if (recached)
-        {
-            // Insert the ready event in the associated stream (must have current device set properly)
-            if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error;
-        }
-        else
-        {
-            // Free the allocation from the runtime and cleanup the event.
-            if (CubDebug(error = cudaFree(d_ptr))) return error;
-            if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;
-
-            if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
-                device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live);
-        }
-
-        // Reset device
-        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
-        {
-            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
-        }
-
-        return error;
-    }
-
-
-    /**
-     * \brief Frees a live allocation of device memory on the current device, returning it to the allocator.
-     *
-     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
-     * with which it was associated with during allocation, and it becomes available for reuse within other
-     * streams when all prior work submitted to \p active_stream has completed.
-     */
-    cudaError_t DeviceFree(
-        void*           d_ptr)
-    {
-        return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr);
-    }
-
-
-    /**
-     * \brief Frees all cached device allocations on all devices
-     */
-    cudaError_t FreeAllCached()
-    {
-        cudaError_t error         = cudaSuccess;
-        int entrypoint_device     = INVALID_DEVICE_ORDINAL;
-        int current_device        = INVALID_DEVICE_ORDINAL;
-
-        mutex.Lock();
-
-        while (!cached_blocks.empty())
-        {
-            // Get first block
-            CachedBlocks::iterator begin = cached_blocks.begin();
-
-            // Get entry-point device ordinal if necessary
-            if (entrypoint_device == INVALID_DEVICE_ORDINAL)
-            {
-                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
-            }
-
-            // Set current device ordinal if necessary
-            if (begin->device != current_device)
-            {
-                if (CubDebug(error = cudaSetDevice(begin->device))) break;
-                current_device = begin->device;
-            }
-
-            // Free device memory
-            if (CubDebug(error = cudaFree(begin->d_ptr))) break;
-            if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break;
-
-            // Reduce balance and erase entry
-            cached_bytes[current_device].free -= begin->bytes;
-
-            if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
-                current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live);
-
-            cached_blocks.erase(begin);
-        }
-
-        mutex.Unlock();
-
-        // Attempt to revert back to entry-point device if necessary
-        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
-        {
-            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
-        }
-
-        return error;
-    }
-
-
-    /**
-     * \brief Destructor
-     */
-    virtual ~CachingDeviceAllocator()
-    {
-        if (!skip_cleanup)
-            FreeAllCached();
-    }
-
-};
-
-
-
-
-/** @} */       // end group UtilMgmt
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/util_arch.cuh b/thrust/system/cuda/detail/cub/util_arch.cuh
deleted file mode 100644
index e869b85b5..000000000
--- a/thrust/system/cuda/detail/cub/util_arch.cuh
+++ /dev/null
@@ -1,151 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Static architectural properties by SM version.
- */
-
-#pragma once
-
-#include "util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-#if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS)
-    #define CUB_USE_COOPERATIVE_GROUPS
-#endif
-
-/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
-#ifndef CUB_PTX_ARCH
-    #ifndef __CUDA_ARCH__
-        #define CUB_PTX_ARCH 0
-    #else
-        #define CUB_PTX_ARCH __CUDA_ARCH__
-    #endif
-#endif
-
-
-/// Whether or not the source targeted by the active compiler pass is allowed to  invoke device kernels or methods from the CUDA runtime API.
-#ifndef CUB_RUNTIME_FUNCTION
-    #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__))
-        #define CUB_RUNTIME_ENABLED
-        #define CUB_RUNTIME_FUNCTION __host__ __device__
-    #else
-        #define CUB_RUNTIME_FUNCTION __host__
-    #endif
-#endif
-
-
-/// Number of threads per warp
-#ifndef CUB_LOG_WARP_THREADS
-    #define CUB_LOG_WARP_THREADS(arch)                      \
-        (5)
-    #define CUB_WARP_THREADS(arch)                          \
-        (1 << CUB_LOG_WARP_THREADS(arch))
-
-    #define CUB_PTX_WARP_THREADS        CUB_WARP_THREADS(CUB_PTX_ARCH)
-    #define CUB_PTX_LOG_WARP_THREADS    CUB_LOG_WARP_THREADS(CUB_PTX_ARCH)
-#endif
-
-
-/// Number of smem banks
-#ifndef CUB_LOG_SMEM_BANKS
-    #define CUB_LOG_SMEM_BANKS(arch)                        \
-        ((arch >= 200) ?                                    \
-            (5) :                                           \
-            (4))
-    #define CUB_SMEM_BANKS(arch)                            \
-        (1 << CUB_LOG_SMEM_BANKS(arch))
-
-    #define CUB_PTX_LOG_SMEM_BANKS      CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH)
-    #define CUB_PTX_SMEM_BANKS          CUB_SMEM_BANKS(CUB_PTX_ARCH)
-#endif
-
-
-/// Oversubscription factor
-#ifndef CUB_SUBSCRIPTION_FACTOR
-    #define CUB_SUBSCRIPTION_FACTOR(arch)                   \
-        ((arch >= 300) ?                                    \
-            (5) :                                           \
-            ((arch >= 200) ?                                \
-                (3) :                                       \
-                (10)))
-    #define CUB_PTX_SUBSCRIPTION_FACTOR             CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH)
-#endif
-
-
-/// Prefer padding overhead vs X-way conflicts greater than this threshold
-#ifndef CUB_PREFER_CONFLICT_OVER_PADDING
-    #define CUB_PREFER_CONFLICT_OVER_PADDING(arch)          \
-        ((arch >= 300) ?                                    \
-            (1) :                                           \
-            (4))
-    #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING    CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH)
-#endif
-
-
-/// Scale down the number of threads to keep same amount of scratch storage as the nominal configuration for 4B data.  Minimum of two warps.
-#ifndef CUB_SCALED_BLOCK_THREADS
-    #define CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)                   \
-        (CUB_MIN(                                                                           \
-            NOMINAL_4B_BLOCK_THREADS,                                                       \
-            CUB_WARP_THREADS(PTX_ARCH) * CUB_MAX(                                           \
-                2,                                                                          \
-                (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4 / sizeof(T))))
-#endif
-
-/// Scale down number of items per thread to keep the same amount of register storage as the nominal configuration for 4B data.  Minimum 1 item per thread
-#ifndef CUB_SCALED_ITEMS_PER_THREAD
-    #define CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)     \
-        CUB_MAX(                                                                                                \
-            1,                                                                                                  \
-            (sizeof(T) < 4) ?                                                                                   \
-                ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) / 2 :  \
-                ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH))
-#endif
-
-/// Define both nominal threads-per-block and items-per-thread
-#ifndef CUB_SCALED_GRANULARITIES
-    #define CUB_SCALED_GRANULARITIES(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T)      \
-        CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200),                                   \
-        CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200)
-#endif
-
-
-
-#endif  // Do not document
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/util_debug.cuh b/thrust/system/cuda/detail/cub/util_debug.cuh
deleted file mode 100644
index c7074fc8f..000000000
--- a/thrust/system/cuda/detail/cub/util_debug.cuh
+++ /dev/null
@@ -1,145 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Error and event logging routines.
- *
- * The following macros definitions are supported:
- * - \p CUB_LOG.  Simple event messages are printed to \p stdout.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include "util_namespace.cuh"
-#include "util_arch.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilMgmt
- * @{
- */
-
-
-/// CUB error reporting macro (prints error messages to stderr)
-#if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR)
-    #define CUB_STDERR
-#endif
-
-
-
-/**
- * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context.
- *
- * \return The CUDA error.
- */
-__host__ __device__ __forceinline__ cudaError_t Debug(
-    cudaError_t     error,
-    const char*     filename,
-    int             line)
-{
-    (void)filename;
-    (void)line;
-#ifdef CUB_STDERR
-    if (error)
-    {
-    #if (CUB_PTX_ARCH == 0)
-        fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error));
-        fflush(stderr);
-    #elif (CUB_PTX_ARCH >= 200)
-        printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line);
-    #endif
-    }
-#endif
-    return error;
-}
-
-
-/**
- * \brief Debug macro
- */
-#ifndef CubDebug
-    #define CubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__)
-#endif
-
-
-/**
- * \brief Debug macro with exit
- */
-#ifndef CubDebugExit
-    #define CubDebugExit(e) if (cub::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); }
-#endif
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-/**
- * \brief Log macro for printf statements.
- */
-#if !defined(_CubLog)
-    #if !(defined(__clang__) && defined(__CUDA__))
-        #if (CUB_PTX_ARCH == 0)
-            #define _CubLog(format, ...) printf(format,__VA_ARGS__);
-        #elif (CUB_PTX_ARCH >= 200)
-            #define _CubLog(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__);
-        #endif
-    #else
-        // XXX shameless hack for clang around variadic printf...
-        //     Compilies w/o supplying -std=c++11 but shows warning,
-        //     so we sielence them :)
-        #pragma clang diagnostic ignored "-Wc++11-extensions"
-        #pragma clang diagnostic ignored "-Wunnamed-type-template-args"
-            template <class... Args>
-            inline __host__ __device__ void va_printf(char const* format, Args const&... args)
-            {
-        #ifdef __CUDA_ARCH__
-              printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...);
-        #else
-              printf(format, args...);
-        #endif
-            }
-        #ifndef __CUDA_ARCH__
-            #define _CubLog(format, ...) va_printf(format,__VA_ARGS__);
-        #else
-            #define _CubLog(format, ...) va_printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, __VA_ARGS__);
-        #endif
-    #endif
-#endif
-
-
-
-
-/** @} */       // end group UtilMgmt
-
diff --git a/thrust/system/cuda/detail/cub/util_device.cuh b/thrust/system/cuda/detail/cub/util_device.cuh
deleted file mode 100644
index ca55bd530..000000000
--- a/thrust/system/cuda/detail/cub/util_device.cuh
+++ /dev/null
@@ -1,347 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Properties of a given CUDA device and the corresponding PTX bundle
- */
-
-#pragma once
-
-#include "util_type.cuh"
-#include "util_arch.cuh"
-#include "util_debug.cuh"
-#include "util_namespace.cuh"
-#include "util_macro.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilMgmt
- * @{
- */
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-/**
- * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed).
- */
-template <int ALLOCATIONS>
-__host__ __device__ __forceinline__
-cudaError_t AliasTemporaries(
-    void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-    size_t  &temp_storage_bytes,                ///< [in,out] Size in bytes of \t d_temp_storage allocation
-    void*   (&allocations)[ALLOCATIONS],        ///< [in,out] Pointers to device allocations needed
-    size_t  (&allocation_sizes)[ALLOCATIONS])   ///< [in] Sizes in bytes of device allocations needed
-{
-    const int ALIGN_BYTES   = 256;
-    const int ALIGN_MASK    = ~(ALIGN_BYTES - 1);
-
-    // Compute exclusive prefix sum over allocation requests
-    size_t allocation_offsets[ALLOCATIONS];
-    size_t bytes_needed = 0;
-    for (int i = 0; i < ALLOCATIONS; ++i)
-    {
-        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
-        allocation_offsets[i] = bytes_needed;
-        bytes_needed += allocation_bytes;
-    }
-    bytes_needed += ALIGN_BYTES - 1;
-
-    // Check if the caller is simply requesting the size of the storage allocation
-    if (!d_temp_storage)
-    {
-        temp_storage_bytes = bytes_needed;
-        return cudaSuccess;
-    }
-
-    // Check if enough storage provided
-    if (temp_storage_bytes < bytes_needed)
-    {
-        return CubDebug(cudaErrorInvalidValue);
-    }
-
-    // Alias
-    d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK);
-    for (int i = 0; i < ALLOCATIONS; ++i)
-    {
-        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
-    }
-
-    return cudaSuccess;
-}
-
-
-/**
- * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device
- */
-template <typename T>
-__global__ void EmptyKernel(void) { }
-
-
-#endif  // DOXYGEN_SHOULD_SKIP_THIS
-
-/**
- * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
- */
-CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
-{
-    struct Dummy
-    {
-        /// Type definition of the EmptyKernel kernel entry point
-        typedef void (*EmptyKernelPtr)();
-
-        /// Force EmptyKernel<void> to be generated if this class is used
-        CUB_RUNTIME_FUNCTION __forceinline__
-        EmptyKernelPtr Empty()
-        {
-            return EmptyKernel<void>;
-        }
-    };
-
-
-#ifndef CUB_RUNTIME_ENABLED
-    (void)ptx_version;
-
-    // CUDA API calls not supported from this device
-    return cudaErrorInvalidConfiguration;
-
-#elif (CUB_PTX_ARCH > 0)
-
-    ptx_version = CUB_PTX_ARCH;
-    return cudaSuccess;
-
-#else
-
-    cudaError_t error = cudaSuccess;
-    do
-    {
-        cudaFuncAttributes empty_kernel_attrs;
-        if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;
-        ptx_version = empty_kernel_attrs.ptxVersion * 10;
-    }
-    while (0);
-
-    return error;
-
-#endif
-}
-
-
-/**
- * \brief Retrieves the SM version (major * 100 + minor * 10)
- */
-CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal)
-{
-#ifndef CUB_RUNTIME_ENABLED
-    (void)sm_version;
-    (void)device_ordinal;
-
-    // CUDA API calls not supported from this device
-    return cudaErrorInvalidConfiguration;
-
-#else
-
-    cudaError_t error = cudaSuccess;
-    do
-    {
-        // Fill in SM version
-        int major, minor;
-        if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
-        if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
-        sm_version = major * 100 + minor * 10;
-    }
-    while (0);
-
-    return error;
-
-#endif
-}
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-/**
- * Synchronize the stream if specified
- */
-CUB_RUNTIME_FUNCTION __forceinline__
-static cudaError_t SyncStream(cudaStream_t stream)
-{
-#if (CUB_PTX_ARCH == 0)
-    return cudaStreamSynchronize(stream);
-#else
-    (void)stream;
-    // Device can't yet sync on a specific stream
-    return cudaDeviceSynchronize();
-#endif
-}
-
-
-/**
- * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block.
- *
- * \par Snippet
- * The code snippet below illustrates the use of the MaxSmOccupancy function.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/util_device.cuh>
- *
- * template <typename T>
- * __global__ void ExampleKernel()
- * {
- *     // Allocate shared memory for BlockScan
- *     __shared__ volatile T buffer[4096];
- *
- *        ...
- * }
- *
- *     ...
- *
- * // Determine SM occupancy for ExampleKernel specialized for unsigned char
- * int max_sm_occupancy;
- * MaxSmOccupancy(max_sm_occupancy, ExampleKernel<unsigned char>, 64);
- *
- * // max_sm_occupancy  <-- 4 on SM10
- * // max_sm_occupancy  <-- 8 on SM20
- * // max_sm_occupancy  <-- 12 on SM35
- *
- * \endcode
- *
- */
-template <typename KernelPtr>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t MaxSmOccupancy(
-    int                 &max_sm_occupancy,          ///< [out] maximum number of thread blocks that can reside on a single SM
-    KernelPtr           kernel_ptr,                 ///< [in] Kernel pointer for which to compute SM occupancy
-    int                 block_threads,              ///< [in] Number of threads per thread block
-    int                 dynamic_smem_bytes = 0)
-{
-#ifndef CUB_RUNTIME_ENABLED
-    (void)dynamic_smem_bytes;
-    (void)block_threads;
-    (void)kernel_ptr;
-    (void)max_sm_occupancy;
-
-    // CUDA API calls not supported from this device
-    return CubDebug(cudaErrorInvalidConfiguration);
-
-#else
-
-    return cudaOccupancyMaxActiveBlocksPerMultiprocessor (
-        &max_sm_occupancy,
-        kernel_ptr,
-        block_threads,
-        dynamic_smem_bytes);
-
-#endif  // CUB_RUNTIME_ENABLED
-}
-
-
-/******************************************************************************
- * Policy management
- ******************************************************************************/
-
-/**
- * Kernel dispatch configuration
- */
-struct KernelConfig
-{
-    int block_threads;
-    int items_per_thread;
-    int tile_size;
-    int sm_occupancy;
-
-    CUB_RUNTIME_FUNCTION __forceinline__
-    KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {}
-
-    template <typename AgentPolicyT, typename KernelPtrT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t Init(KernelPtrT kernel_ptr)
-    {
-        block_threads        = AgentPolicyT::BLOCK_THREADS;
-        items_per_thread     = AgentPolicyT::ITEMS_PER_THREAD;
-        tile_size            = block_threads * items_per_thread;
-        cudaError_t retval   = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
-        return retval;
-    }
-};
-
-
-
-/// Helper for dispatching into a policy chain
-template <int PTX_VERSION, typename PolicyT, typename PrevPolicyT>
-struct ChainedPolicy
-{
-   /// The policy for the active compiler pass
-   typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy;
-
-   /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
-   template <typename FunctorT>
-   CUB_RUNTIME_FUNCTION __forceinline__
-   static cudaError_t Invoke(int ptx_version, FunctorT &op)
-   {
-       if (ptx_version < PTX_VERSION) {
-           return PrevPolicyT::Invoke(ptx_version, op);
-       }
-       return op.template Invoke<PolicyT>();
-   }
-};
-
-/// Helper for dispatching into a policy chain (end-of-chain specialization)
-template <int PTX_VERSION, typename PolicyT>
-struct ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>
-{
-    /// The policy for the active compiler pass
-    typedef PolicyT ActivePolicy;
-
-    /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
-    template <typename FunctorT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Invoke(int /*ptx_version*/, FunctorT &op) {
-        return op.template Invoke<PolicyT>();
-    }
-};
-
-
-
-
-#endif  // Do not document
-
-
-
-
-/** @} */       // end group UtilMgmt
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/util_macro.cuh b/thrust/system/cuda/detail/cub/util_macro.cuh
deleted file mode 100644
index 14bd9b12b..000000000
--- a/thrust/system/cuda/detail/cub/util_macro.cuh
+++ /dev/null
@@ -1,103 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Common C/C++ macro utilities
- ******************************************************************************/
-
-#pragma once
-
-#include "util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilModule
- * @{
- */
-
-#ifndef CUB_ALIGN
-    #if defined(_WIN32) || defined(_WIN64)
-        /// Align struct
-        #define CUB_ALIGN(bytes) __declspec(align(32))
-    #else
-        /// Align struct
-        #define CUB_ALIGN(bytes) __attribute__((aligned(bytes)))
-    #endif
-#endif
-
-#ifndef CUB_MAX
-    /// Select maximum(a, b)
-    #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a))
-#endif
-
-#ifndef CUB_MIN
-    /// Select minimum(a, b)
-    #define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a))
-#endif
-
-#ifndef CUB_QUOTIENT_FLOOR
-    /// Quotient of x/y rounded down to nearest integer
-    #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y))
-#endif
-
-#ifndef CUB_QUOTIENT_CEILING
-    /// Quotient of x/y rounded up to nearest integer
-    #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y))
-#endif
-
-#ifndef CUB_ROUND_UP_NEAREST
-    /// x rounded up to the nearest multiple of y
-    #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y)
-#endif
-
-#ifndef CUB_ROUND_DOWN_NEAREST
-    /// x rounded down to the nearest multiple of y
-    #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y)
-#endif
-
-
-#ifndef CUB_STATIC_ASSERT
-    #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-        #define CUB_CAT_(a, b) a ## b
-        #define CUB_CAT(a, b) CUB_CAT_(a, b)
-    #endif // DOXYGEN_SHOULD_SKIP_THIS
-
-    /// Static assert
-    #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1]
-#endif
-
-/** @} */       // end group UtilModule
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/util_ptx.cuh b/thrust/system/cuda/detail/cub/util_ptx.cuh
deleted file mode 100644
index aff170333..000000000
--- a/thrust/system/cuda/detail/cub/util_ptx.cuh
+++ /dev/null
@@ -1,729 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * PTX intrinsics
- */
-
-
-#pragma once
-
-#include "util_type.cuh"
-#include "util_arch.cuh"
-#include "util_namespace.cuh"
-#include "util_debug.cuh"
-
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilPtx
- * @{
- */
-
-
-/******************************************************************************
- * PTX helper macros
- ******************************************************************************/
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-/**
- * Register modifier for pointer-types (for inlining PTX assembly)
- */
-#if defined(_WIN64) || defined(__LP64__)
-    #define __CUB_LP64__ 1
-    // 64-bit register modifier for inlined asm
-    #define _CUB_ASM_PTR_ "l"
-    #define _CUB_ASM_PTR_SIZE_ "u64"
-#else
-    #define __CUB_LP64__ 0
-    // 32-bit register modifier for inlined asm
-    #define _CUB_ASM_PTR_ "r"
-    #define _CUB_ASM_PTR_SIZE_ "u32"
-#endif
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/******************************************************************************
- * Inlined PTX intrinsics
- ******************************************************************************/
-
-/**
- * \brief Shift-right then add.  Returns (\p x >> \p shift) + \p addend.
- */
-__device__ __forceinline__ unsigned int SHR_ADD(
-    unsigned int x,
-    unsigned int shift,
-    unsigned int addend)
-{
-    unsigned int ret;
-#if CUB_PTX_ARCH >= 200
-    asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
-        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
-#else
-    ret = (x >> shift) + addend;
-#endif
-    return ret;
-}
-
-
-/**
- * \brief Shift-left then add.  Returns (\p x << \p shift) + \p addend.
- */
-__device__ __forceinline__ unsigned int SHL_ADD(
-    unsigned int x,
-    unsigned int shift,
-    unsigned int addend)
-{
-    unsigned int ret;
-#if CUB_PTX_ARCH >= 200
-    asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
-        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
-#else
-    ret = (x << shift) + addend;
-#endif
-    return ret;
-}
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-/**
- * Bitfield-extract.
- */
-template <typename UnsignedBits, int BYTE_LEN>
-__device__ __forceinline__ unsigned int BFE(
-    UnsignedBits            source,
-    unsigned int            bit_start,
-    unsigned int            num_bits,
-    Int2Type<BYTE_LEN>      /*byte_len*/)
-{
-    unsigned int bits;
-#if CUB_PTX_ARCH >= 200
-    asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
-#else
-    const unsigned int MASK = (1 << num_bits) - 1;
-    bits = (source >> bit_start) & MASK;
-#endif
-    return bits;
-}
-
-
-/**
- * Bitfield-extract for 64-bit types.
- */
-template <typename UnsignedBits>
-__device__ __forceinline__ unsigned int BFE(
-    UnsignedBits            source,
-    unsigned int            bit_start,
-    unsigned int            num_bits,
-    Int2Type<8>             /*byte_len*/)
-{
-    const unsigned long long MASK = (1ull << num_bits) - 1;
-    return (source >> bit_start) & MASK;
-}
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-/**
- * \brief Bitfield-extract.  Extracts \p num_bits from \p source starting at bit-offset \p bit_start.  The input \p source may be an 8b, 16b, 32b, or 64b unsigned integer type.
- */
-template <typename UnsignedBits>
-__device__ __forceinline__ unsigned int BFE(
-    UnsignedBits source,
-    unsigned int bit_start,
-    unsigned int num_bits)
-{
-    return BFE(source, bit_start, num_bits, Int2Type<sizeof(UnsignedBits)>());
-}
-
-
-/**
- * \brief Bitfield insert.  Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start.
- */
-__device__ __forceinline__ void BFI(
-    unsigned int &ret,
-    unsigned int x,
-    unsigned int y,
-    unsigned int bit_start,
-    unsigned int num_bits)
-{
-#if CUB_PTX_ARCH >= 200
-    asm ("bfi.b32 %0, %1, %2, %3, %4;" :
-        "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
-#else
-    x <<= bit_start;
-    unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start;
-    unsigned int MASK_Y = ~MASK_X;
-    ret = (y & MASK_Y) | (x & MASK_X);
-#endif
-}
-
-
-/**
- * \brief Three-operand add.  Returns \p x + \p y + \p z.
- */
-__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
-{
-#if CUB_PTX_ARCH >= 200
-    asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z));
-#else
-    x = x + y + z;
-#endif
-    return x;
-}
-
-
-/**
- * \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register.  For SM2.0 or later.
- *
- * \par
- * The bytes in the two source registers \p a and \p b are numbered from 0 to 7:
- * {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes
- * {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within
- * the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0}
- *
- * \par Snippet
- * The code snippet below illustrates byte-permute.
- * \par
- * \code
- * #include <cub/cub.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     int a        = 0x03020100;
- *     int b        = 0x07060504;
- *     int index    = 0x00007531;
- *
- *     int selected = PRMT(a, b, index);    // 0x07050301
- *
- * \endcode
- *
- */
-__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
-{
-    int ret;
-    asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
-    return ret;
-}
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-/**
- * Sync-threads barrier.
- */
-__device__ __forceinline__ void BAR(int count)
-{
-    asm volatile("bar.sync 1, %0;" : : "r"(count));
-}
-
-/**
- * CTA barrier
- */
-__device__  __forceinline__ void CTA_SYNC()
-{
-    __syncthreads();
-}
-
-
-/**
- * CTA barrier with predicate
- */
-__device__  __forceinline__ int CTA_SYNC_AND(int p)
-{
-    return __syncthreads_and(p);
-}
-
-
-/**
- * Warp barrier
- */
-__device__  __forceinline__ void WARP_SYNC(unsigned int member_mask)
-{
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-    __syncwarp(member_mask);
-#endif
-}
-
-
-/**
- * Warp any
- */
-__device__  __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask)
-{
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-    return __any_sync(member_mask, predicate);
-#else
-    return ::__any(predicate);
-#endif
-}
-
-
-/**
- * Warp any
- */
-__device__  __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask)
-{
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-    return __all_sync(member_mask, predicate);
-#else
-    return ::__all(predicate);
-#endif
-}
-
-
-/**
- * Warp ballot
- */
-__device__  __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask)
-{
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-    return __ballot_sync(member_mask, predicate);
-#else
-    return __ballot(predicate);
-#endif
-}
-
-/**
- * Warp synchronous shfl_up
- */
-__device__ __forceinline__ 
-unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int first_lane, unsigned int member_mask)
-{
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-    asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;"
-        : "=r"(word) : "r"(word), "r"(src_offset), "r"(first_lane), "r"(member_mask));
-#else
-    asm volatile("shfl.up.b32 %0, %1, %2, %3;"
-        : "=r"(word) : "r"(word), "r"(src_offset), "r"(first_lane));
-#endif
-    return word;
-}
-
-/**
- * Warp synchronous shfl_down
- */
-__device__ __forceinline__ 
-unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int last_lane, unsigned int member_mask)
-{
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-    asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;"
-        : "=r"(word) : "r"(word), "r"(src_offset), "r"(last_lane), "r"(member_mask));
-#else
-    asm volatile("shfl.down.b32 %0, %1, %2, %3;"
-        : "=r"(word) : "r"(word), "r"(src_offset), "r"(last_lane));
-#endif
-    return word;
-}
-
-/**
- * Warp synchronous shfl_idx
- */
-__device__ __forceinline__ 
-unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int last_lane, unsigned int member_mask)
-{
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-    asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;"
-        : "=r"(word) : "r"(word), "r"(src_lane), "r"(last_lane), "r"(member_mask));
-#else
-    asm volatile("shfl.idx.b32 %0, %1, %2, %3;"
-        : "=r"(word) : "r"(word), "r"(src_lane), "r"(last_lane));
-#endif
-    return word;
-}
-
-/**
- * Floating point multiply. (Mantissa LSB rounds towards zero.)
- */
-__device__ __forceinline__ float FMUL_RZ(float a, float b)
-{
-    float d;
-    asm ("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b));
-    return d;
-}
-
-
-/**
- * Floating point multiply-add. (Mantissa LSB rounds towards zero.)
- */
-__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
-{
-    float d;
-    asm ("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c));
-    return d;
-}
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-/**
- * \brief Terminates the calling thread
- */
-__device__ __forceinline__ void ThreadExit() {
-    asm volatile("exit;");
-}    
-
-
-/**
- * \brief  Abort execution and generate an interrupt to the host CPU
- */
-__device__ __forceinline__ void ThreadTrap() {
-    asm volatile("trap;");
-}
-
-
-/**
- * \brief Returns the row-major linear thread identifier for a multidimensional thread block
- */
-__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
-{
-    return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) +
-            ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) +
-            threadIdx.x;
-}
-
-
-/**
- * \brief Returns the warp lane ID of the calling thread
- */
-__device__ __forceinline__ unsigned int LaneId()
-{
-    unsigned int ret;
-    asm ("mov.u32 %0, %%laneid;" : "=r"(ret) );
-    return ret;
-}
-
-
-/**
- * \brief Returns the warp ID of the calling thread.  Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block.
- */
-__device__ __forceinline__ unsigned int WarpId()
-{
-    unsigned int ret;
-    asm ("mov.u32 %0, %%warpid;" : "=r"(ret) );
-    return ret;
-}
-
-/**
- * \brief Returns the warp lane mask of all lanes less than the calling thread
- */
-__device__ __forceinline__ unsigned int LaneMaskLt()
-{
-    unsigned int ret;
-    asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) );
-    return ret;
-}
-
-/**
- * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread
- */
-__device__ __forceinline__ unsigned int LaneMaskLe()
-{
-    unsigned int ret;
-    asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) );
-    return ret;
-}
-
-/**
- * \brief Returns the warp lane mask of all lanes greater than the calling thread
- */
-__device__ __forceinline__ unsigned int LaneMaskGt()
-{
-    unsigned int ret;
-    asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) );
-    return ret;
-}
-
-/**
- * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread
- */
-__device__ __forceinline__ unsigned int LaneMaskGe()
-{
-    unsigned int ret;
-    asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) );
-    return ret;
-}
-
-/** @} */       // end group UtilPtx
-
-
-
-
-/**
- * \brief Shuffle-up for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>-<tt>src_offset</tt></sub>.  For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png)
- * \ingroup WarpModule
- *
- * \par
- * - Available only for SM3.0 or newer
- *
- * \par Snippet
- * The code snippet below illustrates each thread obtaining a \p double value from the
- * predecessor of its predecessor.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/util_ptx.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Obtain one input item per thread
- *     double thread_data = ...
- *
- *     // Obtain item from two ranks below
- *     double peer_data = ShuffleUp(thread_data, 2, 0, 0xffffffff);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
- * The corresponding output \p peer_data will be <tt>{1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}</tt>.
- *
- */
-template <typename T>
-__device__ __forceinline__ T ShuffleUp(
-    T               input,              ///< [in] The value to broadcast
-    int             src_offset,         ///< [in] The relative down-offset of the peer to read from
-    int             first_lane,         ///< [in] Index of first lane in segment (typically 0)
-    unsigned int    member_mask)        ///< [in] 32-bit mask of participating warp lanes
-{
-    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
-
-    const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
- 
-    T               output;
-    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
-    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);
-
-    unsigned int shuffle_word;
-    shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_lane, member_mask);
-    output_alias[0] = shuffle_word;
-
-    #pragma unroll
-    for (int WORD = 1; WORD < WORDS; ++WORD)
-    {
-        shuffle_word       = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_lane, member_mask);
-        output_alias[WORD] = shuffle_word;
-    }
-
-    return output;
-}
-
-
-/**
- * \brief Shuffle-down for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>+<tt>src_offset</tt></sub>.  For thread lanes \e i >= WARP_THREADS, the thread's own \p input is returned to the thread.  ![](shfl_down_logo.png)
- * \ingroup WarpModule
- *
- * \par
- * - Available only for SM3.0 or newer
- *
- * \par Snippet
- * The code snippet below illustrates each thread obtaining a \p double value from the
- * successor of its successor.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/util_ptx.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Obtain one input item per thread
- *     double thread_data = ...
- *
- *     // Obtain item from two ranks below
- *     double peer_data = ShuffleDown(thread_data, 2, 31, 0xffffffff);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
- * The corresponding output \p peer_data will be <tt>{3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}</tt>.
- *
- */
-template <typename T>
-__device__ __forceinline__ T ShuffleDown(
-    T               input,              ///< [in] The value to broadcast
-    int             src_offset,         ///< [in] The relative up-offset of the peer to read from
-    int             last_lane,          ///< [in] Index of first lane in segment (typically 31)
-    unsigned int    member_mask)        ///< [in] 32-bit mask of participating warp lanes
-{
-    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
-
-    const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
-
-    T               output;
-    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
-    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);
-
-    unsigned int shuffle_word;
-    shuffle_word    = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_lane, member_mask);
-    output_alias[0] = shuffle_word;
-
-    #pragma unroll
-    for (int WORD = 1; WORD < WORDS; ++WORD)
-    {
-        shuffle_word       = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_lane, member_mask);
-        output_alias[WORD] = shuffle_word;
-    }
-
-    return output;
-}
-
-
-/**
- * \brief Shuffle-broadcast for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input
- * contributed by <em>warp-lane</em><sub><tt>src_lane</tt></sub>.  For \p src_lane < 0 or \p src_lane >= WARP_THREADS,
- * then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png)
- *
- * \ingroup WarpModule
- *
- * \par
- * - Available only for SM3.0 or newer
- *
- * \par Snippet
- * The code snippet below illustrates each thread obtaining a \p double value from <em>warp-lane</em><sub>0</sub>.
- *
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/util_ptx.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Obtain one input item per thread
- *     double thread_data = ...
- *
- *     // Obtain item from thread 0
- *     double peer_data = ShuffleIndex(thread_data, 0, 32, 0xffffffff);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
- * The corresponding output \p peer_data will be <tt>{1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}</tt>.
- *
- */
-template <typename T>
-__device__ __forceinline__ T ShuffleIndex(
-    T               input,                  ///< [in] The value to broadcast
-    int             src_lane,               ///< [in] Which warp lane is to do the broadcasting
-    int             logical_warp_threads,   ///< [in] Number of threads per logical warp
-    unsigned int    member_mask)            ///< [in] 32-bit mask of participating warp lanes
-{
-    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
-
-    const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
-
-    T               output;
-    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
-    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);
-
-    unsigned int shuffle_word;
-    shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0],
-                                 src_lane,
-                                 logical_warp_threads - 1,
-                                 member_mask);
-
-    output_alias[0] = shuffle_word;
-
-    #pragma unroll
-    for (int WORD = 1; WORD < WORDS; ++WORD)
-    {
-        shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD],
-                                     src_lane,
-                                     logical_warp_threads - 1,
-                                     member_mask);
-
-        output_alias[WORD] = shuffle_word;
-    }
-
-    return output;
-}
-
-
-
-/**
- * Compute a 32b mask of threads having the same least-significant
- * LABEL_BITS of \p label as the calling thread.
- */
-template <int LABEL_BITS>
-inline __device__ unsigned int MatchAny(unsigned int label)
-{
-    unsigned int retval;
-
-    // Extract masks of common threads for each bit
-    #pragma unroll
-    for (int BIT = 0; BIT < LABEL_BITS; ++BIT)
-    {
-        unsigned int mask;
-        unsigned int current_bit = 1 << BIT;
-        asm ("{\n"
-            "    .reg .pred p;\n"
-            "    and.b32 %0, %1, %2;"
-            "    setp.eq.u32 p, %0, %2;\n"
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-            "    vote.ballot.sync.b32 %0, p, 0xffffffff;\n"
-#else
-            "    vote.ballot.b32 %0, p;\n"
-#endif
-            "    @!p not.b32 %0, %0;\n"
-            "}\n" : "=r"(mask) : "r"(label), "r"(current_bit));
-
-        // Remove peers who differ
-        retval = (BIT == 0) ? mask : retval & mask;
-    }
-
-    return retval;
-
-//  // VOLTA match
-//    unsigned int retval;
-//    asm ("{\n"
-//         "    match.any.sync.b32 %0, %1, 0xffffffff;\n"
-//         "}\n" : "=r"(retval) : "r"(label));
-//    return retval;
-
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/util_type.cuh b/thrust/system/cuda/detail/cub/util_type.cuh
deleted file mode 100644
index bd3bebd36..000000000
--- a/thrust/system/cuda/detail/cub/util_type.cuh
+++ /dev/null
@@ -1,1167 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Common type manipulation (metaprogramming) utilities
- */
-
-#pragma once
-
-#include <iostream>
-#include <limits>
-#include <cfloat>
-
-#if (__CUDACC_VER_MAJOR__ >= 9)
-    #include <cuda_fp16.h>
-#endif
-
-#include "util_macro.cuh"
-#include "util_arch.cuh"
-#include "util_namespace.cuh"
-
-
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilModule
- * @{
- */
-
-
-
-/******************************************************************************
- * Type equality
- ******************************************************************************/
-
-/**
- * \brief Type selection (<tt>IF ? ThenType : ElseType</tt>)
- */
-template <bool IF, typename ThenType, typename ElseType>
-struct If
-{
-    /// Conditional type result
-    typedef ThenType Type;      // true
-};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-template <typename ThenType, typename ElseType>
-struct If<false, ThenType, ElseType>
-{
-    typedef ElseType Type;      // false
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-/******************************************************************************
- * Conditional types
- ******************************************************************************/
-
-/**
- * \brief Type equality test
- */
-template <typename A, typename B>
-struct Equals
-{
-    enum {
-        VALUE = 0,
-        NEGATE = 1
-    };
-};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-template <typename A>
-struct Equals <A, A>
-{
-    enum {
-        VALUE = 1,
-        NEGATE = 0
-    };
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/******************************************************************************
- * Static math
- ******************************************************************************/
-
-/**
- * \brief Statically determine log2(N), rounded up.
- *
- * For example:
- *     Log2<8>::VALUE   // 3
- *     Log2<3>::VALUE   // 2
- */
-template <int N, int CURRENT_VAL = N, int COUNT = 0>
-struct Log2
-{
-    /// Static logarithm value
-    enum { VALUE = Log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE };         // Inductive case
-};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-template <int N, int COUNT>
-struct Log2<N, 0, COUNT>
-{
-    enum {VALUE = (1 << (COUNT - 1) < N) ?                                  // Base case
-        COUNT :
-        COUNT - 1 };
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/**
- * \brief Statically determine if N is a power-of-two
- */
-template <int N>
-struct PowerOfTwo
-{
-    enum { VALUE = ((N & (N - 1)) == 0) };
-};
-
-
-
-/******************************************************************************
- * Pointer vs. iterator detection
- ******************************************************************************/
-
-/**
- * \brief Pointer vs. iterator
- */
-template <typename Tp>
-struct IsPointer
-{
-    enum { VALUE = 0 };
-};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-template <typename Tp>
-struct IsPointer<Tp*>
-{
-    enum { VALUE = 1 };
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-/******************************************************************************
- * Qualifier detection
- ******************************************************************************/
-
-/**
- * \brief Volatile modifier test
- */
-template <typename Tp>
-struct IsVolatile
-{
-    enum { VALUE = 0 };
-};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-template <typename Tp>
-struct IsVolatile<Tp volatile>
-{
-    enum { VALUE = 1 };
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/******************************************************************************
- * Qualifier removal
- ******************************************************************************/
-
-/**
- * \brief Removes \p const and \p volatile qualifiers from type \p Tp.
- *
- * For example:
- *     <tt>typename RemoveQualifiers<volatile int>::Type         // int;</tt>
- */
-template <typename Tp, typename Up = Tp>
-struct RemoveQualifiers
-{
-    /// Type without \p const and \p volatile qualifiers
-    typedef Up Type;
-};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-template <typename Tp, typename Up>
-struct RemoveQualifiers<Tp, volatile Up>
-{
-    typedef Up Type;
-};
-
-template <typename Tp, typename Up>
-struct RemoveQualifiers<Tp, const Up>
-{
-    typedef Up Type;
-};
-
-template <typename Tp, typename Up>
-struct RemoveQualifiers<Tp, const volatile Up>
-{
-    typedef Up Type;
-};
-
-
-/******************************************************************************
- * Marker types
- ******************************************************************************/
-
-/**
- * \brief A simple "NULL" marker type
- */
-struct NullType
-{
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-    template <typename T>
-    __host__ __device__ __forceinline__ NullType& operator =(const T&) { return *this; }
-
-    __host__ __device__ __forceinline__ bool operator ==(const NullType&) { return true; }
-
-    __host__ __device__ __forceinline__ bool operator !=(const NullType&) { return false; }
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-};
-
-
-/**
- * \brief Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values)
- */
-template <int A>
-struct Int2Type
-{
-   enum {VALUE = A};
-};
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-/******************************************************************************
- * Size and alignment
- ******************************************************************************/
-
-/// Structure alignment
-template <typename T>
-struct AlignBytes
-{
-    struct Pad
-    {
-        T       val;
-        char    byte;
-    };
-
-    enum
-    {
-        /// The "true CUDA" alignment of T in bytes
-        ALIGN_BYTES = sizeof(Pad) - sizeof(T)
-    };
-
-    /// The "truly aligned" type
-    typedef T Type;
-};
-
-// Specializations where host C++ compilers (e.g., 32-bit Windows) may disagree
-// with device C++ compilers (EDG) on types passed as template parameters through
-// kernel functions
-
-#define __CUB_ALIGN_BYTES(t, b)         \
-    template <> struct AlignBytes<t>    \
-    { enum { ALIGN_BYTES = b }; typedef __align__(b) t Type; };
-
-__CUB_ALIGN_BYTES(short4, 8)
-__CUB_ALIGN_BYTES(ushort4, 8)
-__CUB_ALIGN_BYTES(int2, 8)
-__CUB_ALIGN_BYTES(uint2, 8)
-__CUB_ALIGN_BYTES(long long, 8)
-__CUB_ALIGN_BYTES(unsigned long long, 8)
-__CUB_ALIGN_BYTES(float2, 8)
-__CUB_ALIGN_BYTES(double, 8)
-#ifdef _WIN32
-    __CUB_ALIGN_BYTES(long2, 8)
-    __CUB_ALIGN_BYTES(ulong2, 8)
-#else
-    __CUB_ALIGN_BYTES(long2, 16)
-    __CUB_ALIGN_BYTES(ulong2, 16)
-#endif
-__CUB_ALIGN_BYTES(int4, 16)
-__CUB_ALIGN_BYTES(uint4, 16)
-__CUB_ALIGN_BYTES(float4, 16)
-__CUB_ALIGN_BYTES(long4, 16)
-__CUB_ALIGN_BYTES(ulong4, 16)
-__CUB_ALIGN_BYTES(longlong2, 16)
-__CUB_ALIGN_BYTES(ulonglong2, 16)
-__CUB_ALIGN_BYTES(double2, 16)
-__CUB_ALIGN_BYTES(longlong4, 16)
-__CUB_ALIGN_BYTES(ulonglong4, 16)
-__CUB_ALIGN_BYTES(double4, 16)
-
-template <typename T> struct AlignBytes<volatile T> : AlignBytes<T> {};
-template <typename T> struct AlignBytes<const T> : AlignBytes<T> {};
-template <typename T> struct AlignBytes<const volatile T> : AlignBytes<T> {};
-
-
-/// Unit-words of data movement
-template <typename T>
-struct UnitWord
-{
-    enum {
-        ALIGN_BYTES = AlignBytes<T>::ALIGN_BYTES
-    };
-
-    template <typename Unit>
-    struct IsMultiple
-    {
-        enum {
-            UNIT_ALIGN_BYTES    = AlignBytes<Unit>::ALIGN_BYTES,
-            IS_MULTIPLE         = (sizeof(T) % sizeof(Unit) == 0) && (ALIGN_BYTES % UNIT_ALIGN_BYTES == 0)
-        };
-    };
-
-    /// Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T
-    typedef typename If<IsMultiple<int>::IS_MULTIPLE,
-        unsigned int,
-        typename If<IsMultiple<short>::IS_MULTIPLE,
-            unsigned short,
-            unsigned char>::Type>::Type         ShuffleWord;
-
-    /// Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T
-    typedef typename If<IsMultiple<long long>::IS_MULTIPLE,
-        unsigned long long,
-        ShuffleWord>::Type                      VolatileWord;
-
-    /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T
-    typedef typename If<IsMultiple<longlong2>::IS_MULTIPLE,
-        ulonglong2,
-        VolatileWord>::Type                     DeviceWord;
-
-    /// Biggest texture reference word that T is a whole multiple of and is not larger than the alignment of T
-    typedef typename If<IsMultiple<int4>::IS_MULTIPLE,
-        uint4,
-        typename If<IsMultiple<int2>::IS_MULTIPLE,
-            uint2,
-            ShuffleWord>::Type>::Type           TextureWord;
-};
-
-
-// float2 specialization workaround (for SM10-SM13)
-template <>
-struct UnitWord <float2>
-{
-    typedef int         ShuffleWord;
-#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
-    typedef float       VolatileWord;
-    typedef uint2       DeviceWord;
-#else
-    typedef unsigned long long   VolatileWord;
-    typedef unsigned long long   DeviceWord;
-#endif
-    typedef float2      TextureWord;
-};
-
-// float4 specialization workaround (for SM10-SM13)
-template <>
-struct UnitWord <float4>
-{
-    typedef int         ShuffleWord;
-#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
-    typedef float               VolatileWord;
-    typedef uint4               DeviceWord;
-#else
-    typedef unsigned long long  VolatileWord;
-    typedef ulonglong2          DeviceWord;
-#endif
-    typedef float4              TextureWord;
-};
-
-
-// char2 specialization workaround (for SM10-SM13)
-template <>
-struct UnitWord <char2>
-{
-    typedef unsigned short      ShuffleWord;
-#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
-    typedef unsigned short      VolatileWord;
-    typedef short               DeviceWord;
-#else
-    typedef unsigned short      VolatileWord;
-    typedef unsigned short      DeviceWord;
-#endif
-    typedef unsigned short      TextureWord;
-};
-
-
-template <typename T> struct UnitWord<volatile T> : UnitWord<T> {};
-template <typename T> struct UnitWord<const T> : UnitWord<T> {};
-template <typename T> struct UnitWord<const volatile T> : UnitWord<T> {};
-
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-/******************************************************************************
- * Vector type inference utilities.
- ******************************************************************************/
-
-/**
- * \brief Exposes a member typedef \p Type that names the corresponding CUDA vector type if one exists.  Otherwise \p Type refers to the CubVector structure itself, which will wrap the corresponding \p x, \p y, etc. vector fields.
- */
-template <typename T, int vec_elements> struct CubVector;
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-enum
-{
-    /// The maximum number of elements in CUDA vector types
-    MAX_VEC_ELEMENTS = 4,
-};
-
-
-/**
- * Generic vector-1 type
- */
-template <typename T>
-struct CubVector<T, 1>
-{
-    T x;
-
-    typedef T BaseType;
-    typedef CubVector<T, 1> Type;
-};
-
-/**
- * Generic vector-2 type
- */
-template <typename T>
-struct CubVector<T, 2>
-{
-    T x;
-    T y;
-
-    typedef T BaseType;
-    typedef CubVector<T, 2> Type;
-};
-
-/**
- * Generic vector-3 type
- */
-template <typename T>
-struct CubVector<T, 3>
-{
-    T x;
-    T y;
-    T z;
-
-    typedef T BaseType;
-    typedef CubVector<T, 3> Type;
-};
-
-/**
- * Generic vector-4 type
- */
-template <typename T>
-struct CubVector<T, 4>
-{
-    T x;
-    T y;
-    T z;
-    T w;
-
-    typedef T BaseType;
-    typedef CubVector<T, 4> Type;
-};
-
-
-/**
- * Macro for expanding partially-specialized built-in vector types
- */
-#define CUB_DEFINE_VECTOR_TYPE(base_type,short_type)                                                    \
-                                                                                                        \
-    template<> struct CubVector<base_type, 1> : short_type##1                                           \
-    {                                                                                                   \
-      typedef base_type       BaseType;                                                                 \
-      typedef short_type##1   Type;                                                                     \
-      __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {           \
-          CubVector retval;                                                                             \
-          retval.x = x + other.x;                                                                       \
-          return retval;                                                                                \
-      }                                                                                                 \
-      __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {           \
-          CubVector retval;                                                                             \
-          retval.x = x - other.x;                                                                       \
-          return retval;                                                                                \
-      }                                                                                                 \
-    };                                                                                                  \
-                                                                                                        \
-    template<> struct CubVector<base_type, 2> : short_type##2                                           \
-    {                                                                                                   \
-        typedef base_type       BaseType;                                                               \
-        typedef short_type##2   Type;                                                                   \
-        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {         \
-            CubVector retval;                                                                           \
-            retval.x = x + other.x;                                                                     \
-            retval.y = y + other.y;                                                                     \
-            return retval;                                                                              \
-        }                                                                                               \
-        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {         \
-            CubVector retval;                                                                           \
-            retval.x = x - other.x;                                                                     \
-            retval.y = y - other.y;                                                                     \
-            return retval;                                                                              \
-        }                                                                                               \
-    };                                                                                                  \
-                                                                                                        \
-    template<> struct CubVector<base_type, 3> : short_type##3                                           \
-    {                                                                                                   \
-        typedef base_type       BaseType;                                                               \
-        typedef short_type##3   Type;                                                                   \
-        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {         \
-            CubVector retval;                                                                           \
-            retval.x = x + other.x;                                                                     \
-            retval.y = y + other.y;                                                                     \
-            retval.z = z + other.z;                                                                     \
-            return retval;                                                                              \
-        }                                                                                               \
-        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {         \
-            CubVector retval;                                                                           \
-            retval.x = x - other.x;                                                                     \
-            retval.y = y - other.y;                                                                     \
-            retval.z = z - other.z;                                                                     \
-            return retval;                                                                              \
-        }                                                                                               \
-    };                                                                                                  \
-                                                                                                        \
-    template<> struct CubVector<base_type, 4> : short_type##4                                           \
-    {                                                                                                   \
-        typedef base_type       BaseType;                                                               \
-        typedef short_type##4   Type;                                                                   \
-        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {         \
-            CubVector retval;                                                                           \
-            retval.x = x + other.x;                                                                     \
-            retval.y = y + other.y;                                                                     \
-            retval.z = z + other.z;                                                                     \
-            retval.w = w + other.w;                                                                     \
-            return retval;                                                                              \
-        }                                                                                               \
-        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {         \
-            CubVector retval;                                                                           \
-            retval.x = x - other.x;                                                                     \
-            retval.y = y - other.y;                                                                     \
-            retval.z = z - other.z;                                                                     \
-            retval.w = w - other.w;                                                                     \
-            return retval;                                                                              \
-        }                                                                                               \
-    };
-
-
-
-// Expand CUDA vector types for built-in primitives
-CUB_DEFINE_VECTOR_TYPE(char,               char)
-CUB_DEFINE_VECTOR_TYPE(signed char,        char)
-CUB_DEFINE_VECTOR_TYPE(short,              short)
-CUB_DEFINE_VECTOR_TYPE(int,                int)
-CUB_DEFINE_VECTOR_TYPE(long,               long)
-CUB_DEFINE_VECTOR_TYPE(long long,          longlong)
-CUB_DEFINE_VECTOR_TYPE(unsigned char,      uchar)
-CUB_DEFINE_VECTOR_TYPE(unsigned short,     ushort)
-CUB_DEFINE_VECTOR_TYPE(unsigned int,       uint)
-CUB_DEFINE_VECTOR_TYPE(unsigned long,      ulong)
-CUB_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong)
-CUB_DEFINE_VECTOR_TYPE(float,              float)
-CUB_DEFINE_VECTOR_TYPE(double,             double)
-CUB_DEFINE_VECTOR_TYPE(bool,               uchar)
-
-// Undefine macros
-#undef CUB_DEFINE_VECTOR_TYPE
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-/******************************************************************************
- * Wrapper types
- ******************************************************************************/
-
-/**
- * \brief A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions
- */
-template <typename T>
-struct Uninitialized
-{
-    /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T
-    typedef typename UnitWord<T>::DeviceWord DeviceWord;
-
-    enum
-    {
-        WORDS = sizeof(T) / sizeof(DeviceWord)
-    };
-
-    /// Backing storage
-    DeviceWord storage[WORDS];
-
-    /// Alias
-    __host__ __device__ __forceinline__ T& Alias()
-    {
-        return reinterpret_cast<T&>(*this);
-    }
-};
-
-
-/**
- * \brief A key identifier paired with a corresponding value
- */
-template <
-    typename    _Key,
-    typename    _Value
-#if defined(_WIN32) && !defined(_WIN64)
-    , bool KeyIsLT = (AlignBytes<_Key>::ALIGN_BYTES < AlignBytes<_Value>::ALIGN_BYTES)
-    , bool ValIsLT = (AlignBytes<_Value>::ALIGN_BYTES < AlignBytes<_Key>::ALIGN_BYTES)
-#endif // #if defined(_WIN32) && !defined(_WIN64)
-    >
-struct KeyValuePair
-{
-    typedef _Key    Key;                ///< Key data type
-    typedef _Value  Value;              ///< Value data type
-
-    Key     key;                        ///< Item key
-    Value   value;                      ///< Item value
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    KeyValuePair() {}
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}
-
-    /// Inequality operator
-    __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
-    {
-        return (value != b.value) || (key != b.key);
-    }
-};
-
-#if defined(_WIN32) && !defined(_WIN64)
-
-/**
- * Win32 won't do 16B alignment.  This can present two problems for
- * should-be-16B-aligned (but actually 8B aligned) built-in and intrinsics members:
- * 1) If a smaller-aligned item were to be listed first, the host compiler places the
- *    should-be-16B item at too early an offset (and disagrees with device compiler)
- * 2) Or, if a smaller-aligned item lists second, the host compiler gets the size
- *    of the struct wrong (and disagrees with device compiler)
- *
- * So we put the larger-should-be-aligned item first, and explicitly pad the
- * end of the struct
- */
-
-/// Smaller key specialization
-template <typename K, typename V>
-struct KeyValuePair<K, V, true, false>
-{
-    typedef K Key;
-    typedef V Value;
-
-    typedef char Pad[AlignBytes<V>::ALIGN_BYTES - AlignBytes<K>::ALIGN_BYTES];
-
-    Value   value;  // Value has larger would-be alignment and goes first
-    Key     key;
-    Pad     pad;
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    KeyValuePair() {}
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}
-
-    /// Inequality operator
-    __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
-    {
-        return (value != b.value) || (key != b.key);
-    }
-};
-
-
-/// Smaller value specialization
-template <typename K, typename V>
-struct KeyValuePair<K, V, false, true>
-{
-    typedef K Key;
-    typedef V Value;
-
-    typedef char Pad[AlignBytes<K>::ALIGN_BYTES - AlignBytes<V>::ALIGN_BYTES];
-
-    Key     key;    // Key has larger would-be alignment and goes first
-    Value   value;
-    Pad     pad;
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    KeyValuePair() {}
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {}
-
-    /// Inequality operator
-    __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
-    {
-        return (value != b.value) || (key != b.key);
-    }
-};
-
-#endif // #if defined(_WIN32) && !defined(_WIN64)
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-/**
- * \brief A wrapper for passing simple static arrays as kernel parameters
- */
-template <typename T, int COUNT>
-struct ArrayWrapper
-{
-
-    /// Statically-sized array of type \p T
-    T array[COUNT];
-
-    /// Constructor
-    __host__ __device__ __forceinline__ ArrayWrapper() {}
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-/**
- * \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth.
- *
- * Many multi-pass computations require a pair of "ping-pong" storage
- * buffers (e.g., one for reading from and the other for writing to, and then
- * vice-versa for the subsequent pass).  This structure wraps a set of device
- * buffers and a "selector" member to track which is "current".
- */
-template <typename T>
-struct DoubleBuffer
-{
-    /// Pair of device buffer pointers
-    T *d_buffers[2];
-
-    ///  Selector into \p d_buffers (i.e., the active/valid buffer)
-    int selector;
-
-    /// \brief Constructor
-    __host__ __device__ __forceinline__ DoubleBuffer()
-    {
-        selector = 0;
-        d_buffers[0] = NULL;
-        d_buffers[1] = NULL;
-    }
-
-    /// \brief Constructor
-    __host__ __device__ __forceinline__ DoubleBuffer(
-        T *d_current,         ///< The currently valid buffer
-        T *d_alternate)       ///< Alternate storage buffer of the same size as \p d_current
-    {
-        selector = 0;
-        d_buffers[0] = d_current;
-        d_buffers[1] = d_alternate;
-    }
-
-    /// \brief Return pointer to the currently valid buffer
-    __host__ __device__ __forceinline__ T* Current() { return d_buffers[selector]; }
-
-    /// \brief Return pointer to the currently invalid buffer
-    __host__ __device__ __forceinline__ T* Alternate() { return d_buffers[selector ^ 1]; }
-
-};
-
-
-
-/******************************************************************************
- * Typedef-detection
- ******************************************************************************/
-
-
-/**
- * \brief Defines a structure \p detector_name that is templated on type \p T.  The \p detector_name struct exposes a constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name
- */
-#define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name)  \
-    template <typename T>                                               \
-    struct detector_name                                                \
-    {                                                                   \
-        template <typename C>                                           \
-        static char& test(typename C::nested_type_name*);               \
-        template <typename>                                             \
-        static int& test(...);                                          \
-        enum                                                            \
-        {                                                               \
-            VALUE = sizeof(test<T>(0)) < sizeof(int)                    \
-        };                                                              \
-    };
-
-
-
-/******************************************************************************
- * Simple enable-if (similar to Boost)
- ******************************************************************************/
-
-/**
- * \brief Simple enable-if (similar to Boost)
- */
-template <bool Condition, class T = void>
-struct EnableIf
-{
-    /// Enable-if type for SFINAE dummy variables
-    typedef T Type;
-};
-
-
-template <class T>
-struct EnableIf<false, T> {};
-
-
-
-/******************************************************************************
- * Typedef-detection
- ******************************************************************************/
-
-/**
- * \brief Determine whether or not BinaryOp's functor is of the form <tt>bool operator()(const T& a, const T&b)</tt> or <tt>bool operator()(const T& a, const T&b, unsigned int idx)</tt>
- */
-template <typename T, typename BinaryOp>
-struct BinaryOpHasIdxParam
-{
-private:
-/*
-    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx) const>  struct SFINAE1 {};
-    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx)>        struct SFINAE2 {};
-    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx) const>                struct SFINAE3 {};
-    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx)>                      struct SFINAE4 {};
-*/
-    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx) const>           struct SFINAE5 {};
-    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx)>                 struct SFINAE6 {};
-    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx) const>                         struct SFINAE7 {};
-    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx)>                               struct SFINAE8 {};
-/*
-    template <typename BinaryOpT> static char Test(SFINAE1<BinaryOpT, &BinaryOpT::operator()> *);
-    template <typename BinaryOpT> static char Test(SFINAE2<BinaryOpT, &BinaryOpT::operator()> *);
-    template <typename BinaryOpT> static char Test(SFINAE3<BinaryOpT, &BinaryOpT::operator()> *);
-    template <typename BinaryOpT> static char Test(SFINAE4<BinaryOpT, &BinaryOpT::operator()> *);
-*/
-    template <typename BinaryOpT> __host__ __device__ static char Test(SFINAE5<BinaryOpT, &BinaryOpT::operator()> *);
-    template <typename BinaryOpT> __host__ __device__ static char Test(SFINAE6<BinaryOpT, &BinaryOpT::operator()> *);
-    template <typename BinaryOpT> __host__ __device__ static char Test(SFINAE7<BinaryOpT, &BinaryOpT::operator()> *);
-    template <typename BinaryOpT> __host__ __device__ static char Test(SFINAE8<BinaryOpT, &BinaryOpT::operator()> *);
-
-    template <typename BinaryOpT> static int Test(...);
-
-public:
-
-    /// Whether the functor BinaryOp has a third <tt>unsigned int</tt> index param
-    static const bool HAS_PARAM = sizeof(Test<BinaryOp>(NULL)) == sizeof(char);
-};
-
-
-
-
-/******************************************************************************
- * Simple type traits utilities.
- *
- * For example:
- *     Traits<int>::CATEGORY             // SIGNED_INTEGER
- *     Traits<NullType>::NULL_TYPE       // true
- *     Traits<uint4>::CATEGORY           // NOT_A_NUMBER
- *     Traits<uint4>::PRIMITIVE;         // false
- *
- ******************************************************************************/
-
-/**
- * \brief Basic type traits categories
- */
-enum Category
-{
-    NOT_A_NUMBER,
-    SIGNED_INTEGER,
-    UNSIGNED_INTEGER,
-    FLOATING_POINT
-};
-
-
-/**
- * \brief Basic type traits
- */
-template <Category _CATEGORY, bool _PRIMITIVE, bool _NULL_TYPE, typename _UnsignedBits, typename T>
-struct BaseTraits
-{
-    /// Category
-    static const Category CATEGORY      = _CATEGORY;
-    enum
-    {
-        PRIMITIVE       = _PRIMITIVE,
-        NULL_TYPE       = _NULL_TYPE,
-    };
-};
-
-
-/**
- * Basic type traits (unsigned primitive specialization)
- */
-template <typename _UnsignedBits, typename T>
-struct BaseTraits<UNSIGNED_INTEGER, true, false, _UnsignedBits, T>
-{
-    typedef _UnsignedBits       UnsignedBits;
-
-    static const Category       CATEGORY    = UNSIGNED_INTEGER;
-    static const UnsignedBits   LOWEST_KEY  = UnsignedBits(0);
-    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1);
-
-    enum
-    {
-        PRIMITIVE       = true,
-        NULL_TYPE       = false,
-    };
-
-
-    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
-    {
-        return key;
-    }
-
-    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
-    {
-        return key;
-    }
-
-    static __host__ __device__ __forceinline__ T Max()
-    {
-        UnsignedBits retval = MAX_KEY;
-        return reinterpret_cast<T&>(retval);
-    }
-
-    static __host__ __device__ __forceinline__ T Lowest()
-    {
-        UnsignedBits retval = LOWEST_KEY;
-        return reinterpret_cast<T&>(retval);
-    }
-};
-
-
-/**
- * Basic type traits (signed primitive specialization)
- */
-template <typename _UnsignedBits, typename T>
-struct BaseTraits<SIGNED_INTEGER, true, false, _UnsignedBits, T>
-{
-    typedef _UnsignedBits       UnsignedBits;
-
-    static const Category       CATEGORY    = SIGNED_INTEGER;
-    static const UnsignedBits   HIGH_BIT    = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
-    static const UnsignedBits   LOWEST_KEY  = HIGH_BIT;
-    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1) ^ HIGH_BIT;
-
-    enum
-    {
-        PRIMITIVE       = true,
-        NULL_TYPE       = false,
-    };
-
-    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
-    {
-        return key ^ HIGH_BIT;
-    };
-
-    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
-    {
-        return key ^ HIGH_BIT;
-    };
-
-    static __host__ __device__ __forceinline__ T Max()
-    {
-        UnsignedBits retval = MAX_KEY;
-        return reinterpret_cast<T&>(retval);
-    }
-
-    static __host__ __device__ __forceinline__ T Lowest()
-    {
-        UnsignedBits retval = LOWEST_KEY;
-        return reinterpret_cast<T&>(retval);
-    }
-};
-
-template <typename _T>
-struct FpLimits;
-
-template <>
-struct FpLimits<float>
-{
-    static __host__ __device__ __forceinline__ float Max() {
-        return FLT_MAX;
-    }
-
-    static __host__ __device__ __forceinline__ float Lowest() {
-        return FLT_MAX * float(-1);
-    }
-};
-
-template <>
-struct FpLimits<double>
-{
-    static __host__ __device__ __forceinline__ double Max() {
-        return DBL_MAX;
-    }
-
-    static __host__ __device__ __forceinline__ double Lowest() {
-        return DBL_MAX  * double(-1);
-    }
-};
-
-
-#if (__CUDACC_VER_MAJOR__ >= 9)
-template <>
-struct FpLimits<__half>
-{
-    static __host__ __device__ __forceinline__ __half Max() {
-        unsigned short max_word = 0x7BFF;
-        return reinterpret_cast<__half&>(max_word);
-    }
-
-    static __host__ __device__ __forceinline__ __half Lowest() {
-        unsigned short lowest_word = 0xFBFF;
-        return reinterpret_cast<__half&>(lowest_word);
-    }
-};
-#endif
-
-
-/**
- * Basic type traits (fp primitive specialization)
- */
-template <typename _UnsignedBits, typename T>
-struct BaseTraits<FLOATING_POINT, true, false, _UnsignedBits, T>
-{
-    typedef _UnsignedBits       UnsignedBits;
-
-    static const Category       CATEGORY    = FLOATING_POINT;
-    static const UnsignedBits   HIGH_BIT    = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
-    static const UnsignedBits   LOWEST_KEY  = UnsignedBits(-1);
-    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1) ^ HIGH_BIT;
-
-    enum
-    {
-        PRIMITIVE       = true,
-        NULL_TYPE       = false,
-    };
-
-    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
-    {
-        UnsignedBits mask = (key & HIGH_BIT) ? UnsignedBits(-1) : HIGH_BIT;
-        return key ^ mask;
-    };
-
-    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
-    {
-        UnsignedBits mask = (key & HIGH_BIT) ? HIGH_BIT : UnsignedBits(-1);
-        return key ^ mask;
-    };
-
-    static __host__ __device__ __forceinline__ T Max() {
-        return FpLimits<T>::Max();
-    }
-
-    static __host__ __device__ __forceinline__ T Lowest() {
-        return FpLimits<T>::Lowest();
-    }
-};
-
-
-/**
- * \brief Numeric type traits
- */
-template <typename T> struct NumericTraits :            BaseTraits<NOT_A_NUMBER, false, false, T, T> {};
-
-template <> struct NumericTraits<NullType> :            BaseTraits<NOT_A_NUMBER, false, true, NullType, NullType> {};
-
-template <> struct NumericTraits<char> :                BaseTraits<(std::numeric_limits<char>::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char, char> {};
-template <> struct NumericTraits<signed char> :         BaseTraits<SIGNED_INTEGER, true, false, unsigned char, signed char> {};
-template <> struct NumericTraits<short> :               BaseTraits<SIGNED_INTEGER, true, false, unsigned short, short> {};
-template <> struct NumericTraits<int> :                 BaseTraits<SIGNED_INTEGER, true, false, unsigned int, int> {};
-template <> struct NumericTraits<long> :                BaseTraits<SIGNED_INTEGER, true, false, unsigned long, long> {};
-template <> struct NumericTraits<long long> :           BaseTraits<SIGNED_INTEGER, true, false, unsigned long long, long long> {};
-
-template <> struct NumericTraits<unsigned char> :       BaseTraits<UNSIGNED_INTEGER, true, false, unsigned char, unsigned char> {};
-template <> struct NumericTraits<unsigned short> :      BaseTraits<UNSIGNED_INTEGER, true, false, unsigned short, unsigned short> {};
-template <> struct NumericTraits<unsigned int> :        BaseTraits<UNSIGNED_INTEGER, true, false, unsigned int, unsigned int> {};
-template <> struct NumericTraits<unsigned long> :       BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long, unsigned long> {};
-template <> struct NumericTraits<unsigned long long> :  BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long long, unsigned long long> {};
-
-template <> struct NumericTraits<float> :               BaseTraits<FLOATING_POINT, true, false, unsigned int, float> {};
-template <> struct NumericTraits<double> :              BaseTraits<FLOATING_POINT, true, false, unsigned long long, double> {};
-#if (__CUDACC_VER_MAJOR__ >= 9)
-    template <> struct NumericTraits<__half> :          BaseTraits<FLOATING_POINT, true, false, unsigned short, __half> {};
-#endif
-
-template <> struct NumericTraits<bool> :                BaseTraits<UNSIGNED_INTEGER, true, false, typename UnitWord<bool>::VolatileWord, bool> {};
-
-
-
-/**
- * \brief Type traits
- */
-template <typename T>
-struct Traits : NumericTraits<typename RemoveQualifiers<T>::Type> {};
-
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/** @} */       // end group UtilModule
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh b/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh
deleted file mode 100644
index c92765297..000000000
--- a/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh
+++ /dev/null
@@ -1,551 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.
- */
-
-#pragma once
-
-#include "../../thread/thread_operators.cuh"
-#include "../../util_ptx.cuh"
-#include "../../util_type.cuh"
-#include "../../util_macro.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.
- *
- * LOGICAL_WARP_THREADS must be a power-of-two
- */
-template <
-    typename    T,                      ///< Data type being reduced
-    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
-    int         PTX_ARCH>               ///< The PTX compute capability for which to to specialize this collective
-struct WarpReduceShfl
-{
-    //---------------------------------------------------------------------
-    // Constants and type definitions
-    //---------------------------------------------------------------------
-
-    enum
-    {
-        /// Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// The number of warp reduction steps
-        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
-
-        /// Number of logical warps in a PTX warp
-        LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS,
-    };
-
-    template <typename S>
-    struct IsInteger
-    {
-        enum {
-            ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange
-            IS_SMALL_UNSIGNED = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int))
-        };
-    };
-
-
-    // Creates a mask where the last thread in each logical warp is set
-    template <int WARP, int WARPS>
-    struct LastLaneMask
-    {
-        enum {
-            BASE_MASK   = 1 << (LOGICAL_WARP_THREADS - 1),
-            MASK        = (LastLaneMask<WARP + 1, WARPS>::MASK << LOGICAL_WARP_THREADS) | BASE_MASK,
-        };
-    };
-
-    // Creates a mask where the last thread in each logical warp is set
-    template <int WARP>
-    struct LastLaneMask<WARP, WARP>
-    {
-        enum {
-            MASK        = 1 << (LOGICAL_WARP_THREADS - 1),
-        };
-    };
-
-
-
-    /// Shared memory storage layout type
-    typedef NullType TempStorage;
-
-
-    //---------------------------------------------------------------------
-    // Thread fields
-    //---------------------------------------------------------------------
-
-
-    unsigned int lane_id;
-
-    unsigned int member_mask;
-
-    //---------------------------------------------------------------------
-    // Construction
-    //---------------------------------------------------------------------
-
-    /// Constructor
-    __device__ __forceinline__ WarpReduceShfl(
-        TempStorage &/*temp_storage*/)
-    :
-        lane_id(LaneId()),
-
-        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP) ?
-            0 : // arch-width subwarps need not be tiled within the arch-warp
-            ((lane_id / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Reduction steps
-    //---------------------------------------------------------------------
-
-    /// Reduction (specialized for summation across uint32 types)
-    __device__ __forceinline__ unsigned int ReduceStep(
-        unsigned int    input,              ///< [in] Calling thread's input item.
-        cub::Sum        /*reduction_op*/,   ///< [in] Binary reduction operator
-        int             last_lane,          ///< [in] Index of last lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        unsigned int output;
-
-        // Use predicate set from SHFL to guard against invalid peers
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .u32 r0;"
-            "  .reg .pred p;"
-            "  shfl.sync.down.b32 r0|p, %1, %2, %3, %5;"
-            "  @p add.u32 r0, r0, %4;"
-            "  mov.u32 %0, r0;"
-            "}"
-            : "=r"(output) : "r"(input), "r"(offset), "r"(last_lane), "r"(input), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .u32 r0;"
-            "  .reg .pred p;"
-            "  shfl.down.b32 r0|p, %1, %2, %3;"
-            "  @p add.u32 r0, r0, %4;"
-            "  mov.u32 %0, r0;"
-            "}"
-            : "=r"(output) : "r"(input), "r"(offset), "r"(last_lane), "r"(input));
-#endif
-
-        return output;
-    }
-
-
-    /// Reduction (specialized for summation across fp32 types)
-    __device__ __forceinline__ float ReduceStep(
-        float           input,              ///< [in] Calling thread's input item.
-        cub::Sum        /*reduction_op*/,   ///< [in] Binary reduction operator
-        int             last_lane,          ///< [in] Index of last lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        float output;
-
-        // Use predicate set from SHFL to guard against invalid peers
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .f32 r0;"
-            "  .reg .pred p;"
-            "  shfl.sync.down.b32 r0|p, %1, %2, %3, %5;"
-            "  @p add.f32 r0, r0, %4;"
-            "  mov.f32 %0, r0;"
-            "}"
-            : "=f"(output) : "f"(input), "r"(offset), "r"(last_lane), "f"(input), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .f32 r0;"
-            "  .reg .pred p;"
-            "  shfl.down.b32 r0|p, %1, %2, %3;"
-            "  @p add.f32 r0, r0, %4;"
-            "  mov.f32 %0, r0;"
-            "}"
-            : "=f"(output) : "f"(input), "r"(offset), "r"(last_lane), "f"(input));
-#endif
-
-        return output;
-    }
-
-
-    /// Reduction (specialized for summation across unsigned long long types)
-    __device__ __forceinline__ unsigned long long ReduceStep(
-        unsigned long long  input,              ///< [in] Calling thread's input item.
-        cub::Sum            /*reduction_op*/,   ///< [in] Binary reduction operator
-        int                 last_lane,          ///< [in] Index of last lane in segment
-        int                 offset)             ///< [in] Up-offset to pull from
-    {
-        unsigned long long output;
-
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
-            "  shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
-            "  mov.b64 %0, {lo, hi};"
-            "  @p add.u64 %0, %0, %1;"
-            "}"
-            : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.down.b32 lo|p, lo, %2, %3;"
-            "  shfl.down.b32 hi|p, hi, %2, %3;"
-            "  mov.b64 %0, {lo, hi};"
-            "  @p add.u64 %0, %0, %1;"
-            "}"
-            : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane));
-#endif
-
-        return output;
-    }
-
-
-    /// Reduction (specialized for summation across long long types)
-    __device__ __forceinline__ long long ReduceStep(
-        long long           input,              ///< [in] Calling thread's input item.
-        cub::Sum            /*reduction_op*/,   ///< [in] Binary reduction operator
-        int                 last_lane,          ///< [in] Index of last lane in segment
-        int                 offset)             ///< [in] Up-offset to pull from
-    {
-        long long output;
-
-        // Use predicate set from SHFL to guard against invalid peers
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
-            "  shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
-            "  mov.b64 %0, {lo, hi};"
-            "  @p add.s64 %0, %0, %1;"
-            "}"
-            : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.down.b32 lo|p, lo, %2, %3;"
-            "  shfl.down.b32 hi|p, hi, %2, %3;"
-            "  mov.b64 %0, {lo, hi};"
-            "  @p add.s64 %0, %0, %1;"
-            "}"
-            : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane));
-#endif
-
-        return output;
-    }
-
-
-    /// Reduction (specialized for summation across double types)
-    __device__ __forceinline__ double ReduceStep(
-        double              input,              ///< [in] Calling thread's input item.
-        cub::Sum            /*reduction_op*/,   ///< [in] Binary reduction operator
-        int                 last_lane,          ///< [in] Index of last lane in segment
-        int                 offset)             ///< [in] Up-offset to pull from
-    {
-        double output;
-
-        // Use predicate set from SHFL to guard against invalid peers
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  .reg .f64 r0;"
-            "  mov.b64 %0, %1;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.sync.down.b32 lo|p, lo, %2, %3, %4;"
-            "  shfl.sync.down.b32 hi|p, hi, %2, %3, %4;"
-            "  mov.b64 r0, {lo, hi};"
-            "  @p add.f64 %0, %0, r0;"
-            "}"
-            : "=d"(output) : "d"(input), "r"(offset), "r"(last_lane), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  .reg .f64 r0;"
-            "  mov.b64 %0, %1;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.down.b32 lo|p, lo, %2, %3;"
-            "  shfl.down.b32 hi|p, hi, %2, %3;"
-            "  mov.b64 r0, {lo, hi};"
-            "  @p add.f64 %0, %0, r0;"
-            "}"
-            : "=d"(output) : "d"(input), "r"(offset), "r"(last_lane));
-#endif
-
-        return output;
-    }
-
-
-    /// Reduction (specialized for swizzled ReduceByKeyOp<cub::Sum> across KeyValuePair<KeyT, ValueT> types)
-    template <typename ValueT, typename KeyT>
-    __device__ __forceinline__ KeyValuePair<KeyT, ValueT> ReduceStep(
-        KeyValuePair<KeyT, ValueT>                  input,              ///< [in] Calling thread's input item.
-        SwizzleScanOp<ReduceByKeyOp<cub::Sum> >     /*reduction_op*/,       ///< [in] Binary reduction operator
-        int                                         last_lane,          ///< [in] Index of last lane in segment
-        int                                         offset)             ///< [in] Up-offset to pull from
-    {
-        KeyValuePair<KeyT, ValueT> output;
-
-        KeyT other_key = ShuffleDown(input.key, offset, last_lane, member_mask);
-        
-        output.key = input.key;
-        output.value = ReduceStep(
-            input.value, 
-            cub::Sum(), 
-            last_lane, 
-            offset, 
-            Int2Type<IsInteger<ValueT>::IS_SMALL_UNSIGNED>());
-
-        if (input.key != other_key)
-            output.value = input.value;
-
-        return output;
-    }
-
-
-
-    /// Reduction (specialized for swizzled ReduceBySegmentOp<cub::Sum> across KeyValuePair<OffsetT, ValueT> types)
-    template <typename ValueT, typename OffsetT>
-    __device__ __forceinline__ KeyValuePair<OffsetT, ValueT> ReduceStep(
-        KeyValuePair<OffsetT, ValueT>                 input,              ///< [in] Calling thread's input item.
-        SwizzleScanOp<ReduceBySegmentOp<cub::Sum> >   /*reduction_op*/,   ///< [in] Binary reduction operator
-        int                                           last_lane,          ///< [in] Index of last lane in segment
-        int                                           offset)             ///< [in] Up-offset to pull from
-    {
-        KeyValuePair<OffsetT, ValueT> output;
-
-        output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset, Int2Type<IsInteger<ValueT>::IS_SMALL_UNSIGNED>());
-        output.key = ReduceStep(input.key, cub::Sum(), last_lane, offset, Int2Type<IsInteger<OffsetT>::IS_SMALL_UNSIGNED>());
-
-        if (input.key > 0)
-            output.value = input.value;
-
-        return output;
-    }
-
-
-    /// Reduction step (generic)
-    template <typename _T, typename ReductionOp>
-    __device__ __forceinline__ _T ReduceStep(
-        _T                  input,              ///< [in] Calling thread's input item.
-        ReductionOp         reduction_op,       ///< [in] Binary reduction operator
-        int                 last_lane,          ///< [in] Index of last lane in segment
-        int                 offset)             ///< [in] Up-offset to pull from
-    {
-        _T output = input;
-
-        _T temp = ShuffleDown(output, offset, last_lane, member_mask);
-
-        // Perform reduction op if valid
-        if (offset + lane_id <= last_lane)
-            output = reduction_op(input, temp);
-
-        return output;
-    }
-
-
-    /// Reduction step (specialized for small unsigned integers size 32b or less)
-    template <typename _T, typename ReductionOp>
-    __device__ __forceinline__ _T ReduceStep(
-        _T              input,                  ///< [in] Calling thread's input item.
-        ReductionOp     reduction_op,           ///< [in] Binary reduction operator
-        int             last_lane,              ///< [in] Index of last lane in segment
-        int             offset,                 ///< [in] Up-offset to pull from
-        Int2Type<true>  /*is_small_unsigned*/)  ///< [in] Marker type indicating whether T is a small unsigned integer
-    {
-        return ReduceStep(input, reduction_op, last_lane, offset);
-    }
-
-
-    /// Reduction step (specialized for types other than small unsigned integers size 32b or less)
-    template <typename _T, typename ReductionOp>
-    __device__ __forceinline__ _T ReduceStep(
-        _T              input,                  ///< [in] Calling thread's input item.
-        ReductionOp     reduction_op,           ///< [in] Binary reduction operator
-        int             last_lane,              ///< [in] Index of last lane in segment
-        int             offset,                 ///< [in] Up-offset to pull from
-        Int2Type<false> /*is_small_unsigned*/)  ///< [in] Marker type indicating whether T is a small unsigned integer
-    {
-        return ReduceStep(input, reduction_op, last_lane, offset);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Templated inclusive scan iteration
-    //---------------------------------------------------------------------
-
-    template <typename ReductionOp, int STEP>
-    __device__ __forceinline__ void ReduceStep(
-        T&              input,              ///< [in] Calling thread's input item.
-        ReductionOp     reduction_op,       ///< [in] Binary reduction operator
-        int             last_lane,          ///< [in] Index of last lane in segment
-        Int2Type<STEP>  /*step*/)
-    {
-        input = ReduceStep(input, reduction_op, last_lane, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_UNSIGNED>());
-
-        ReduceStep(input, reduction_op, last_lane, Int2Type<STEP + 1>());
-    }
-
-    template <typename ReductionOp>
-    __device__ __forceinline__ void ReduceStep(
-        T&              /*input*/,              ///< [in] Calling thread's input item.
-        ReductionOp     /*reduction_op*/,       ///< [in] Binary reduction operator
-        int             /*last_lane*/,          ///< [in] Index of last lane in segment
-        Int2Type<STEPS> /*step*/)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Reduction operations
-    //---------------------------------------------------------------------
-
-    /// Reduction
-    template <
-        bool            ALL_LANES_VALID,        ///< Whether all lanes in each warp are contributing a valid fold of items
-        int             FOLDED_ITEMS_PER_LANE,  ///< Number of items folded into each lane
-        typename        ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T               input,                  ///< [in] Calling thread's input
-        int             folded_items_per_warp,  ///< [in] Total number of valid items folded into each logical warp
-        ReductionOp     reduction_op)           ///< [in] Binary reduction operator
-    {
-        // Get the lane of the first and last thread in the logical warp
-        int first_thread   = 0;
-        int last_thread    = LOGICAL_WARP_THREADS - 1;
-        if (!IS_ARCH_WARP)
-        {
-            first_thread = lane_id & (~(LOGICAL_WARP_THREADS - 1));
-            last_thread |= lane_id;
-        }
-
-        // Common case is FOLDED_ITEMS_PER_LANE = 1 (or a multiple of 32)
-        int lanes_with_valid_data = (folded_items_per_warp > 0 ? (folded_items_per_warp - 1) / FOLDED_ITEMS_PER_LANE : 0);
-
-        // Get the last valid lane
-        int last_lane = (ALL_LANES_VALID) ?
-            last_thread :
-            CUB_MIN(last_thread, first_thread + lanes_with_valid_data);
-
-        T output = input;
-
-//        // Iterate reduction steps
-//        #pragma unroll
-//        for (int STEP = 0; STEP < STEPS; STEP++)
-//        {
-//            output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_UNSIGNED>());
-//        }
-
-        // Template-iterate reduction steps
-        ReduceStep(output, reduction_op, last_lane, Int2Type<0>());
-
-        return output;
-    }
-
-
-    /// Segmented reduction
-    template <
-        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
-        typename        FlagT,
-        typename        ReductionOp>
-    __device__ __forceinline__ T SegmentedReduce(
-        T               input,              ///< [in] Calling thread's input
-        FlagT           flag,               ///< [in] Whether or not the current lane is a segment head/tail
-        ReductionOp     reduction_op)       ///< [in] Binary reduction operator
-    {
-        // Get the start flags for each thread in the warp.
-        int warp_flags = WARP_BALLOT(flag, member_mask);
-
-        // Convert to tail-segmented
-        if (HEAD_SEGMENTED)
-            warp_flags >>= 1;
-
-        // Mask in the last lanes of each logical warp
-        warp_flags |= LastLaneMask<1, LOGICAL_WARPS>::MASK;
-
-        // Mask out the bits below the current thread
-        warp_flags &= LaneMaskGe();
-
-        // Find the next set flag
-        int last_lane = __clz(__brev(warp_flags));
-
-        T output = input;
-
-//        // Iterate reduction steps
-//        #pragma unroll
-//        for (int STEP = 0; STEP < STEPS; STEP++)
-//        {
-//            output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_UNSIGNED>());
-//        }
-
-        // Template-iterate reduction steps
-        ReduceStep(output, reduction_op, last_lane, Int2Type<0>());
-
-        return output;
-    }
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh b/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh
deleted file mode 100644
index 4325ca0c8..000000000
--- a/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh
+++ /dev/null
@@ -1,375 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
- */
-
-#pragma once
-
-#include "../../thread/thread_operators.cuh"
-#include "../../thread/thread_load.cuh"
-#include "../../thread/thread_store.cuh"
-#include "../../util_type.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
- */
-template <
-    typename    T,                      ///< Data type being reduced
-    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
-    int         PTX_ARCH>               ///< The PTX compute capability for which to to specialize this collective
-struct WarpReduceSmem
-{
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    enum
-    {
-        /// Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// Whether the logical warp size is a power-of-two
-        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
-
-        /// The number of warp scan steps
-        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
-
-        /// The number of threads in half a warp
-        HALF_WARP_THREADS = 1 << (STEPS - 1),
-
-        /// The number of shared memory elements per warp
-        WARP_SMEM_ELEMENTS =  LOGICAL_WARP_THREADS + HALF_WARP_THREADS,
-
-        /// FlagT status (when not using ballot)
-        UNSET   = 0x0,  // Is initially unset
-        SET     = 0x1,  // Is initially set
-        SEEN    = 0x2,  // Has seen another head flag from a successor peer
-    };
-
-    /// Shared memory flag type
-    typedef unsigned char SmemFlag;
-
-    /// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
-    struct _TempStorage
-    {
-        T           reduce[WARP_SMEM_ELEMENTS];
-        SmemFlag    flags[WARP_SMEM_ELEMENTS];
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    _TempStorage    &temp_storage;
-    unsigned int    lane_id;
-    unsigned int    member_mask;
-
-
-    /******************************************************************************
-     * Construction
-     ******************************************************************************/
-
-    /// Constructor
-    __device__ __forceinline__ WarpReduceSmem(
-        TempStorage     &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-
-        lane_id(IS_ARCH_WARP ?
-            LaneId() :
-            LaneId() % LOGICAL_WARP_THREADS),
-
-        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ?
-            0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp
-            ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
-    {}
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    //---------------------------------------------------------------------
-    // Regular reduction
-    //---------------------------------------------------------------------
-
-    /**
-     * Reduction step
-     */
-    template <
-        bool                ALL_LANES_VALID,        ///< Whether all lanes in each warp are contributing a valid fold of items
-        int                 FOLDED_ITEMS_PER_LANE,  ///< Number of items folded into each lane
-        typename            ReductionOp,
-        int                 STEP>
-    __device__ __forceinline__ T ReduceStep(
-        T                   input,                  ///< [in] Calling thread's input
-        int                 folded_items_per_warp,  ///< [in] Total number of valid items folded into each logical warp
-        ReductionOp         reduction_op,           ///< [in] Reduction operator
-        Int2Type<STEP>      /*step*/)
-    {
-        const int OFFSET = 1 << STEP;
-
-        // Share input through buffer
-        ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);
-
-        WARP_SYNC(member_mask);
-
-        // Update input if peer_addend is in range
-        if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE < folded_items_per_warp))
-        {
-            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
-            input = reduction_op(input, peer_addend);
-        }
-
-        WARP_SYNC(member_mask);
-
-        return ReduceStep<ALL_LANES_VALID, FOLDED_ITEMS_PER_LANE>(input, folded_items_per_warp, reduction_op, Int2Type<STEP + 1>());
-    }
-
-
-    /**
-     * Reduction step (terminate)
-     */
-    template <
-        bool                ALL_LANES_VALID,            ///< Whether all lanes in each warp are contributing a valid fold of items
-        int                 FOLDED_ITEMS_PER_LANE,      ///< Number of items folded into each lane
-        typename            ReductionOp>
-    __device__ __forceinline__ T ReduceStep(
-        T                   input,                      ///< [in] Calling thread's input
-        int                 /*folded_items_per_warp*/,  ///< [in] Total number of valid items folded into each logical warp
-        ReductionOp         /*reduction_op*/,           ///< [in] Reduction operator
-        Int2Type<STEPS>     /*step*/)
-    {
-        return input;
-    }
-
-
-    //---------------------------------------------------------------------
-    // Segmented reduction
-    //---------------------------------------------------------------------
-
-
-    /**
-     * Ballot-based segmented reduce
-     */
-    template <
-        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
-        typename        FlagT,
-        typename        ReductionOp>
-    __device__ __forceinline__ T SegmentedReduce(
-        T               input,                  ///< [in] Calling thread's input
-        FlagT           flag,                   ///< [in] Whether or not the current lane is a segment head/tail
-        ReductionOp     reduction_op,           ///< [in] Reduction operator
-        Int2Type<true>  /*has_ballot*/)         ///< [in] Marker type for whether the target arch has ballot functionality
-    {
-        // Get the start flags for each thread in the warp.
-        int warp_flags = WARP_BALLOT(flag, member_mask);
-
-        if (!HEAD_SEGMENTED)
-            warp_flags <<= 1;
-
-        // Keep bits above the current thread.
-        warp_flags &= LaneMaskGt();
-
-        // Accommodate packing of multiple logical warps in a single physical warp
-        if (!IS_ARCH_WARP)
-        {
-            warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS;
-        }
-
-        // Find next flag
-        int next_flag = __clz(__brev(warp_flags));
-
-        // Clip the next segment at the warp boundary if necessary
-        if (LOGICAL_WARP_THREADS != 32)
-            next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS);
-
-        #pragma unroll
-        for (int STEP = 0; STEP < STEPS; STEP++)
-        {
-            const int OFFSET = 1 << STEP;
-
-            // Share input into buffer
-            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);
-
-            WARP_SYNC(member_mask);
-
-            // Update input if peer_addend is in range
-            if (OFFSET + lane_id < next_flag)
-            {
-                T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
-                input = reduction_op(input, peer_addend);
-            }
-
-            WARP_SYNC(member_mask);
-        }
-
-        return input;
-    }
-
-
-    /**
-     * Smem-based segmented reduce
-     */
-    template <
-        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
-        typename        FlagT,
-        typename        ReductionOp>
-    __device__ __forceinline__ T SegmentedReduce(
-        T               input,                  ///< [in] Calling thread's input
-        FlagT           flag,                   ///< [in] Whether or not the current lane is a segment head/tail
-        ReductionOp     reduction_op,           ///< [in] Reduction operator
-        Int2Type<false> /*has_ballot*/)         ///< [in] Marker type for whether the target arch has ballot functionality
-    {
-        enum
-        {
-            UNSET   = 0x0,  // Is initially unset
-            SET     = 0x1,  // Is initially set
-            SEEN    = 0x2,  // Has seen another head flag from a successor peer
-        };
-
-        // Alias flags onto shared data storage
-        volatile SmemFlag *flag_storage = temp_storage.flags;
-
-        SmemFlag flag_status = (flag) ? SET : UNSET;
-
-        for (int STEP = 0; STEP < STEPS; STEP++)
-        {
-            const int OFFSET = 1 << STEP;
-
-            // Share input through buffer
-            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);
-
-            WARP_SYNC(member_mask);
-
-            // Get peer from buffer
-            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
-
-            WARP_SYNC(member_mask);
-
-            // Share flag through buffer
-            flag_storage[lane_id] = flag_status;
-
-            // Get peer flag from buffer
-            SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET];
-
-            // Update input if peer was in range
-            if (lane_id < LOGICAL_WARP_THREADS - OFFSET)
-            {
-                if (HEAD_SEGMENTED)
-                {
-                    // Head-segmented
-                    if ((flag_status & SEEN) == 0)
-                    {
-                        // Has not seen a more distant head flag
-                        if (peer_flag_status & SET)
-                        {
-                            // Has now seen a head flag
-                            flag_status |= SEEN;
-                        }
-                        else
-                        {
-                            // Peer is not a head flag: grab its count
-                            input = reduction_op(input, peer_addend);
-                        }
-
-                        // Update seen status to include that of peer
-                        flag_status |= (peer_flag_status & SEEN);
-                    }
-                }
-                else
-                {
-                    // Tail-segmented.  Simply propagate flag status
-                    if (!flag_status)
-                    {
-                        input = reduction_op(input, peer_addend);
-                        flag_status |= peer_flag_status;
-                    }
-
-                }
-            }
-        }
-
-        return input;
-    }
-
-
-    /******************************************************************************
-     * Interface
-     ******************************************************************************/
-
-    /**
-     * Reduction
-     */
-    template <
-        bool                ALL_LANES_VALID,        ///< Whether all lanes in each warp are contributing a valid fold of items
-        int                 FOLDED_ITEMS_PER_LANE,  ///< Number of items folded into each lane
-        typename            ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T                   input,                  ///< [in] Calling thread's input
-        int                 folded_items_per_warp,  ///< [in] Total number of valid items folded into each logical warp
-        ReductionOp         reduction_op)           ///< [in] Reduction operator
-    {
-        return ReduceStep<ALL_LANES_VALID, FOLDED_ITEMS_PER_LANE>(input, folded_items_per_warp, reduction_op, Int2Type<0>());
-    }
-
-
-    /**
-     * Segmented reduction
-     */
-    template <
-        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
-        typename        FlagT,
-        typename        ReductionOp>
-    __device__ __forceinline__ T SegmentedReduce(
-        T               input,              ///< [in] Calling thread's input
-        FlagT            flag,               ///< [in] Whether or not the current lane is a segment head/tail
-        ReductionOp     reduction_op)       ///< [in] Reduction operator
-    {
-        return SegmentedReduce<HEAD_SEGMENTED>(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>());
-    }
-
-
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh b/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh
deleted file mode 100644
index d5f40161b..000000000
--- a/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh
+++ /dev/null
@@ -1,656 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
- */
-
-#pragma once
-
-#include "../../thread/thread_operators.cuh"
-#include "../../util_type.cuh"
-#include "../../util_ptx.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
- *
- * LOGICAL_WARP_THREADS must be a power-of-two
- */
-template <
-    typename    T,                      ///< Data type being scanned
-    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
-    int         PTX_ARCH>               ///< The PTX compute capability for which to to specialize this collective
-struct WarpScanShfl
-{
-    //---------------------------------------------------------------------
-    // Constants and type definitions
-    //---------------------------------------------------------------------
-
-    enum
-    {
-        /// Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// The number of warp scan steps
-        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
-
-        /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
-        SHFL_C = ((0xFFFFFFFFU << STEPS) & 31) << 8,
-    };
-
-    template <typename S>
-    struct IntegerTraits
-    {
-        enum {
-            ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange
-            IS_SMALL_UNSIGNED = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int))
-        };
-    };
-
-    /// Shared memory storage layout type
-    struct TempStorage {};
-
-
-    //---------------------------------------------------------------------
-    // Thread fields
-    //---------------------------------------------------------------------
-
-    unsigned int lane_id;
-
-    unsigned int member_mask;
-
-    //---------------------------------------------------------------------
-    // Construction
-    //---------------------------------------------------------------------
-
-    /// Constructor
-    __device__ __forceinline__ WarpScanShfl(
-        TempStorage &/*temp_storage*/)
-    :
-        lane_id(LaneId()),
-
-        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP) ?
-            0 : // arch-width subwarps need not be tiled within the arch-warp
-            ((lane_id / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Inclusive scan steps
-    //---------------------------------------------------------------------
-
-    /// Inclusive prefix scan step (specialized for summation across int32 types)
-    __device__ __forceinline__ int InclusiveScanStep(
-        int             input,              ///< [in] Calling thread's input item.
-        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        int output;
-        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
-
-        // Use predicate set from SHFL to guard against invalid peers
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .s32 r0;"
-            "  .reg .pred p;"
-            "  shfl.sync.up.b32 r0|p, %1, %2, %3, %5;"
-            "  @p add.s32 r0, r0, %4;"
-            "  mov.s32 %0, r0;"
-            "}"
-            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .s32 r0;"
-            "  .reg .pred p;"
-            "  shfl.up.b32 r0|p, %1, %2, %3;"
-            "  @p add.s32 r0, r0, %4;"
-            "  mov.s32 %0, r0;"
-            "}"
-            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input));
-#endif
-
-        return output;
-    }
-
-    /// Inclusive prefix scan step (specialized for summation across uint32 types)
-    __device__ __forceinline__ unsigned int InclusiveScanStep(
-        unsigned int    input,              ///< [in] Calling thread's input item.
-        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        unsigned int output;
-        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
-
-        // Use predicate set from SHFL to guard against invalid peers
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .u32 r0;"
-            "  .reg .pred p;"
-            "  shfl.sync.up.b32 r0|p, %1, %2, %3, %5;"
-            "  @p add.u32 r0, r0, %4;"
-            "  mov.u32 %0, r0;"
-            "}"
-            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .u32 r0;"
-            "  .reg .pred p;"
-            "  shfl.up.b32 r0|p, %1, %2, %3;"
-            "  @p add.u32 r0, r0, %4;"
-            "  mov.u32 %0, r0;"
-            "}"
-            : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input));
-#endif
-
-        return output;
-    }
-
-
-    /// Inclusive prefix scan step (specialized for summation across fp32 types)
-    __device__ __forceinline__ float InclusiveScanStep(
-        float           input,              ///< [in] Calling thread's input item.
-        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        float output;
-        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
-
-        // Use predicate set from SHFL to guard against invalid peers
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .f32 r0;"
-            "  .reg .pred p;"
-            "  shfl.sync.up.b32 r0|p, %1, %2, %3, %5;"
-            "  @p add.f32 r0, r0, %4;"
-            "  mov.f32 %0, r0;"
-            "}"
-            : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .f32 r0;"
-            "  .reg .pred p;"
-            "  shfl.up.b32 r0|p, %1, %2, %3;"
-            "  @p add.f32 r0, r0, %4;"
-            "  mov.f32 %0, r0;"
-            "}"
-            : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input));
-#endif
-
-        return output;
-    }
-
-
-    /// Inclusive prefix scan step (specialized for summation across unsigned long long types)
-    __device__ __forceinline__ unsigned long long InclusiveScanStep(
-        unsigned long long  input,              ///< [in] Calling thread's input item.
-        cub::Sum            /*scan_op*/,        ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        unsigned long long output;
-        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
-
-        // Use predicate set from SHFL to guard against invalid peers
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .u64 r0;"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.sync.up.b32 lo|p, lo, %2, %3, %5;"
-            "  shfl.sync.up.b32 hi|p, hi, %2, %3, %5;"
-            "  mov.b64 r0, {lo, hi};"
-            "  @p add.u64 r0, r0, %4;"
-            "  mov.u64 %0, r0;"
-            "}"
-            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .u64 r0;"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.up.b32 lo|p, lo, %2, %3;"
-            "  shfl.up.b32 hi|p, hi, %2, %3;"
-            "  mov.b64 r0, {lo, hi};"
-            "  @p add.u64 r0, r0, %4;"
-            "  mov.u64 %0, r0;"
-            "}"
-            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input));
-#endif
-
-        return output;
-    }
-
-
-    /// Inclusive prefix scan step (specialized for summation across long long types)
-    __device__ __forceinline__ long long InclusiveScanStep(
-        long long       input,              ///< [in] Calling thread's input item.
-        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        long long output;
-        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
-
-        // Use predicate set from SHFL to guard against invalid peers
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .s64 r0;"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.sync.up.b32 lo|p, lo, %2, %3, %5;"
-            "  shfl.sync.up.b32 hi|p, hi, %2, %3, %5;"
-            "  mov.b64 r0, {lo, hi};"
-            "  @p add.s64 r0, r0, %4;"
-            "  mov.s64 %0, r0;"
-            "}"
-            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .s64 r0;"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.up.b32 lo|p, lo, %2, %3;"
-            "  shfl.up.b32 hi|p, hi, %2, %3;"
-            "  mov.b64 r0, {lo, hi};"
-            "  @p add.s64 r0, r0, %4;"
-            "  mov.s64 %0, r0;"
-            "}"
-            : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input));
-#endif
-
-        return output;
-    }
-
-
-    /// Inclusive prefix scan step (specialized for summation across fp64 types)
-    __device__ __forceinline__ double InclusiveScanStep(
-        double          input,              ///< [in] Calling thread's input item.
-        cub::Sum        /*scan_op*/,        ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        double output;
-        int shfl_c = first_lane | SHFL_C;   // Shuffle control (mask and first-lane)
-
-        // Use predicate set from SHFL to guard against invalid peers
-#ifdef CUB_USE_COOPERATIVE_GROUPS
-        asm volatile(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  .reg .f64 r0;"
-            "  mov.b64 %0, %1;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.sync.up.b32 lo|p, lo, %2, %3, %4;"
-            "  shfl.sync.up.b32 hi|p, hi, %2, %3, %4;"
-            "  mov.b64 r0, {lo, hi};"
-            "  @p add.f64 %0, %0, r0;"
-            "}"
-            : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask));
-#else
-        asm volatile(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  .reg .f64 r0;"
-            "  mov.b64 %0, %1;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.up.b32 lo|p, lo, %2, %3;"
-            "  shfl.up.b32 hi|p, hi, %2, %3;"
-            "  mov.b64 r0, {lo, hi};"
-            "  @p add.f64 %0, %0, r0;"
-            "}"
-            : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c));
-#endif
-
-        return output;
-    }
-
-
-/*
-    /// Inclusive prefix scan (specialized for ReduceBySegmentOp<cub::Sum> across KeyValuePair<OffsetT, Value> types)
-    template <typename Value, typename OffsetT>
-    __device__ __forceinline__ KeyValuePair<OffsetT, Value>InclusiveScanStep(
-        KeyValuePair<OffsetT, Value>    input,              ///< [in] Calling thread's input item.
-        ReduceBySegmentOp<cub::Sum>     scan_op,            ///< [in] Binary scan operator
-        int                             first_lane,         ///< [in] Index of first lane in segment
-        int                             offset)             ///< [in] Up-offset to pull from
-    {
-        KeyValuePair<OffsetT, Value> output;
-
-        output.value = InclusiveScanStep(input.value, cub::Sum(), first_lane, offset, Int2Type<IntegerTraits<Value>::IS_SMALL_UNSIGNED>());
-        output.key = InclusiveScanStep(input.key, cub::Sum(), first_lane, offset, Int2Type<IntegerTraits<OffsetT>::IS_SMALL_UNSIGNED>());
-
-        if (input.key > 0)
-            output.value = input.value;
-
-        return output;
-    }
-*/
-
-    /// Inclusive prefix scan step (generic)
-    template <typename _T, typename ScanOpT>
-    __device__ __forceinline__ _T InclusiveScanStep(
-        _T              input,              ///< [in] Calling thread's input item.
-        ScanOpT          scan_op,            ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        _T temp = ShuffleUp(input, offset, first_lane, member_mask);
-
-        // Perform scan op if from a valid peer
-        _T output = scan_op(temp, input);
-        if (static_cast<int>(lane_id) < first_lane + offset)
-            output = input;
-
-        return output;
-    }
-
-
-    /// Inclusive prefix scan step (specialized for small integers size 32b or less)
-    template <typename _T, typename ScanOpT>
-    __device__ __forceinline__ _T InclusiveScanStep(
-        _T              input,              ///< [in] Calling thread's input item.
-        ScanOpT          scan_op,            ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset,             ///< [in] Up-offset to pull from
-        Int2Type<true>  /*is_small_unsigned*/)  ///< [in] Marker type indicating whether T is a small integer
-    {
-        return InclusiveScanStep(input, scan_op, first_lane, offset);
-    }
-
-
-    /// Inclusive prefix scan step (specialized for types other than small integers size 32b or less)
-    template <typename _T, typename ScanOpT>
-    __device__ __forceinline__ _T InclusiveScanStep(
-        _T              input,              ///< [in] Calling thread's input item.
-        ScanOpT          scan_op,            ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset,             ///< [in] Up-offset to pull from
-        Int2Type<false> /*is_small_unsigned*/)  ///< [in] Marker type indicating whether T is a small integer
-    {
-        return InclusiveScanStep(input, scan_op, first_lane, offset);
-    }
-
-    //---------------------------------------------------------------------
-    // Templated inclusive scan iteration
-    //---------------------------------------------------------------------
-
-    template <typename _T, typename ScanOp, int STEP>
-    __device__ __forceinline__ void InclusiveScanStep(
-        _T&             input,              ///< [in] Calling thread's input item.
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        Int2Type<STEP>  /*step*/)               ///< [in] Marker type indicating scan step
-    {
-        input = InclusiveScanStep(input, scan_op, first_lane, 1 << STEP, Int2Type<IntegerTraits<T>::IS_SMALL_UNSIGNED>());
-
-        InclusiveScanStep(input, scan_op, first_lane, Int2Type<STEP + 1>());
-    }
-
-    template <typename _T, typename ScanOp>
-    __device__ __forceinline__ void InclusiveScanStep(
-        _T&             /*input*/,              ///< [in] Calling thread's input item.
-        ScanOp          /*scan_op*/,            ///< [in] Binary scan operator
-        int             /*first_lane*/,         ///< [in] Index of first lane in segment
-        Int2Type<STEPS> /*step*/)               ///< [in] Marker type indicating scan step
-    {}
-
-
-    /******************************************************************************
-     * Interface
-     ******************************************************************************/
-
-    //---------------------------------------------------------------------
-    // Broadcast
-    //---------------------------------------------------------------------
-
-    /// Broadcast
-    __device__ __forceinline__ T Broadcast(
-        T               input,              ///< [in] The value to broadcast
-        int             src_lane)           ///< [in] Which warp lane is to do the broadcasting
-    {
-        return ShuffleIndex(input, src_lane, LOGICAL_WARP_THREADS, member_mask);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Inclusive operations
-    //---------------------------------------------------------------------
-
-    /// Inclusive scan
-    template <typename _T, typename ScanOpT>
-    __device__ __forceinline__ void InclusiveScan(
-        _T              input,              ///< [in] Calling thread's input item.
-        _T              &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOpT         scan_op)            ///< [in] Binary scan operator
-    {
-        inclusive_output = input;
-
-        // Iterate scan steps
-        int segment_first_lane = 0;
-
-        // Iterate scan steps
-//        InclusiveScanStep(inclusive_output, scan_op, segment_first_lane, Int2Type<0>());
-
-        // Iterate scan steps
-        #pragma unroll
-        for (int STEP = 0; STEP < STEPS; STEP++)
-        {
-            inclusive_output = InclusiveScanStep(
-                inclusive_output,
-                scan_op,
-                segment_first_lane,
-                (1 << STEP),
-                Int2Type<IntegerTraits<T>::IS_SMALL_UNSIGNED>());
-        }
-
-    }
-
-    /// Inclusive scan, specialized for reduce-value-by-key
-    template <typename KeyT, typename ValueT, typename ReductionOpT>
-    __device__ __forceinline__ void InclusiveScan(
-        KeyValuePair<KeyT, ValueT>      input,              ///< [in] Calling thread's input item.
-        KeyValuePair<KeyT, ValueT>      &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ReduceByKeyOp<ReductionOpT >    scan_op)            ///< [in] Binary scan operator
-    {
-        inclusive_output = input;
-
-        KeyT pred_key = ShuffleUp(inclusive_output.key, 1, 0, member_mask);
-
-        unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask);
-
-        // Mask away all lanes greater than ours
-        ballot = ballot & LaneMaskLe();
-
-        // Find index of first set bit
-        int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot));
-
-        // Iterate scan steps
-//        InclusiveScanStep(inclusive_output.value, scan_op.op, segment_first_lane, Int2Type<0>());
-
-        // Iterate scan steps
-        #pragma unroll
-        for (int STEP = 0; STEP < STEPS; STEP++)
-        {
-            inclusive_output.value = InclusiveScanStep(
-                inclusive_output.value,
-                scan_op.op,
-                segment_first_lane,
-                (1 << STEP),
-                Int2Type<IntegerTraits<T>::IS_SMALL_UNSIGNED>());
-        }
-    }
-
-
-    /// Inclusive scan with aggregate
-    template <typename ScanOpT>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOpT         scan_op,            ///< [in] Binary scan operator
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InclusiveScan(input, inclusive_output, scan_op);
-
-        // Grab aggregate from last warp lane
-        warp_aggregate = ShuffleIndex(inclusive_output, LOGICAL_WARP_THREADS - 1, LOGICAL_WARP_THREADS, member_mask);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Get exclusive from inclusive
-    //---------------------------------------------------------------------
-
-    /// Update inclusive and exclusive using input and inclusive
-    template <typename ScanOpT, typename IsIntegerT>
-    __device__ __forceinline__ void Update(
-        T                       /*input*/,          ///< [in]
-        T                       &inclusive,         ///< [in, out]
-        T                       &exclusive,         ///< [out]
-        ScanOpT                 /*scan_op*/,        ///< [in]
-        IsIntegerT              /*is_integer*/)     ///< [in]
-    {
-        // initial value unknown
-        exclusive = ShuffleUp(inclusive, 1, 0, member_mask);
-    }
-
-    /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types)
-    __device__ __forceinline__ void Update(
-        T                       input,
-        T                       &inclusive,
-        T                       &exclusive,
-        cub::Sum                /*scan_op*/,
-        Int2Type<true>          /*is_integer*/)
-    {
-        // initial value presumed 0
-        exclusive = inclusive - input;
-    }
-
-    /// Update inclusive and exclusive using initial value using input, inclusive, and initial value
-    template <typename ScanOpT, typename IsIntegerT>
-    __device__ __forceinline__ void Update (
-        T                       /*input*/,
-        T                       &inclusive,
-        T                       &exclusive,
-        ScanOpT                 scan_op,
-        T                       initial_value,
-        IsIntegerT              /*is_integer*/)
-    {
-        inclusive = scan_op(initial_value, inclusive);
-        exclusive = ShuffleUp(inclusive, 1, 0, member_mask);
-
-        unsigned int segment_id = (IS_ARCH_WARP) ?
-            lane_id :
-            lane_id % LOGICAL_WARP_THREADS;
-
-        if (segment_id == 0)
-            exclusive = initial_value;
-    }
-
-    /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types)
-    __device__ __forceinline__ void Update (
-        T                       input,
-        T                       &inclusive,
-        T                       &exclusive,
-        cub::Sum                scan_op,
-        T                       initial_value,
-        Int2Type<true>          /*is_integer*/)
-    {
-        inclusive = scan_op(initial_value, inclusive);
-        exclusive = inclusive - input;
-    }
-
-
-    /// Update inclusive, exclusive, and warp aggregate using input and inclusive
-    template <typename ScanOpT, typename IsIntegerT>
-    __device__ __forceinline__ void Update (
-        T                       input,
-        T                       &inclusive,
-        T                       &exclusive,
-        T                       &warp_aggregate,
-        ScanOpT                 scan_op,
-        IsIntegerT              is_integer)
-    {
-        warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, LOGICAL_WARP_THREADS, member_mask);
-        Update(input, inclusive, exclusive, scan_op, is_integer);
-    }
-
-    /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value
-    template <typename ScanOpT, typename IsIntegerT>
-    __device__ __forceinline__ void Update (
-        T                       input,
-        T                       &inclusive,
-        T                       &exclusive,
-        T                       &warp_aggregate,
-        ScanOpT                 scan_op,
-        T                       initial_value,
-        IsIntegerT              is_integer)
-    {
-        warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, LOGICAL_WARP_THREADS, member_mask);
-        Update(input, inclusive, exclusive, scan_op, initial_value, is_integer);
-    }
-
-
-
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh b/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh
deleted file mode 100644
index 5bafb3559..000000000
--- a/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh
+++ /dev/null
@@ -1,397 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
- */
-
-#pragma once
-
-#include "../../thread/thread_operators.cuh"
-#include "../../thread/thread_load.cuh"
-#include "../../thread/thread_store.cuh"
-#include "../../util_type.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
- */
-template <
-    typename    T,                      ///< Data type being scanned
-    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
-    int         PTX_ARCH>               ///< The PTX compute capability for which to to specialize this collective
-struct WarpScanSmem
-{
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    enum
-    {
-        /// Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// Whether the logical warp size is a power-of-two
-        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
-
-        /// The number of warp scan steps
-        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
-
-        /// The number of threads in half a warp
-        HALF_WARP_THREADS = 1 << (STEPS - 1),
-
-        /// The number of shared memory elements per warp
-        WARP_SMEM_ELEMENTS =  LOGICAL_WARP_THREADS + HALF_WARP_THREADS,
-    };
-
-    /// Storage cell type (workaround for SM1x compiler bugs with custom-ops like Max() on signed chars)
-    typedef typename If<((Equals<T, char>::VALUE || Equals<T, signed char>::VALUE) && (PTX_ARCH < 200)), int, T>::Type CellT;
-
-    /// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
-    typedef CellT _TempStorage[WARP_SMEM_ELEMENTS];
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    _TempStorage    &temp_storage;
-    unsigned int    lane_id;
-    unsigned int    member_mask;
-
-
-    /******************************************************************************
-     * Construction
-     ******************************************************************************/
-
-    /// Constructor
-    __device__ __forceinline__ WarpScanSmem(
-        TempStorage     &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-
-        lane_id(IS_ARCH_WARP ?
-            LaneId() :
-            LaneId() % LOGICAL_WARP_THREADS),
-
-        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ?
-            0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp
-            ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
-    {}
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Basic inclusive scan iteration (template unrolled, inductive-case specialization)
-    template <
-        bool        HAS_IDENTITY,
-        int         STEP,
-        typename    ScanOp>
-    __device__ __forceinline__ void ScanStep(
-        T                       &partial,
-        ScanOp                  scan_op,
-        Int2Type<STEP>          /*step*/)
-    {
-        const int OFFSET = 1 << STEP;
-
-        // Share partial into buffer
-        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial);
-
-        WARP_SYNC(member_mask);
-
-        // Update partial if addend is in range
-        if (HAS_IDENTITY || (lane_id >= OFFSET))
-        {
-            T addend = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]);
-            partial = scan_op(addend, partial);
-        }
-        WARP_SYNC(member_mask);
-
-        ScanStep<HAS_IDENTITY>(partial, scan_op, Int2Type<STEP + 1>());
-    }
-
-
-    /// Basic inclusive scan iteration(template unrolled, base-case specialization)
-    template <
-        bool        HAS_IDENTITY,
-        typename    ScanOp>
-    __device__ __forceinline__ void ScanStep(
-        T                       &/*partial*/,
-        ScanOp                  /*scan_op*/,
-        Int2Type<STEPS>         /*step*/)
-    {}
-
-
-    /// Inclusive prefix scan (specialized for summation across primitive types)
-    __device__ __forceinline__ void InclusiveScan(
-        T                       input,              ///< [in] Calling thread's input item.
-        T                       &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        Sum                     scan_op,            ///< [in] Binary scan operator
-        Int2Type<true>          /*is_primitive*/)   ///< [in] Marker type indicating whether T is primitive type
-    {
-        T identity = 0;
-        ThreadStore<STORE_VOLATILE>(&temp_storage[lane_id], (CellT) identity);
-
-        WARP_SYNC(member_mask);
-
-        // Iterate scan steps
-        output = input;
-        ScanStep<true>(output, scan_op, Int2Type<0>());
-    }
-
-
-    /// Inclusive prefix scan
-    template <typename ScanOp, int IS_PRIMITIVE>
-    __device__ __forceinline__ void InclusiveScan(
-        T                       input,              ///< [in] Calling thread's input item.
-        T                       &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp                  scan_op,            ///< [in] Binary scan operator
-        Int2Type<IS_PRIMITIVE>  /*is_primitive*/)   ///< [in] Marker type indicating whether T is primitive type
-    {
-        // Iterate scan steps
-        output = input;
-        ScanStep<false>(output, scan_op, Int2Type<0>());
-    }
-
-
-    /******************************************************************************
-     * Interface
-     ******************************************************************************/
-
-    //---------------------------------------------------------------------
-    // Broadcast
-    //---------------------------------------------------------------------
-
-    /// Broadcast
-    __device__ __forceinline__ T Broadcast(
-        T               input,              ///< [in] The value to broadcast
-        unsigned int    src_lane)           ///< [in] Which warp lane is to do the broadcasting
-    {
-        if (lane_id == src_lane)
-        {
-            ThreadStore<STORE_VOLATILE>(temp_storage, (CellT) input);
-        }
-
-        WARP_SYNC(member_mask);
-
-        return (T)ThreadLoad<LOAD_VOLATILE>(temp_storage);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Inclusive operations
-    //---------------------------------------------------------------------
-
-    /// Inclusive scan
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        InclusiveScan(input, inclusive_output, scan_op, Int2Type<Traits<T>::PRIMITIVE>());
-    }
-
-
-    /// Inclusive scan with aggregate
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InclusiveScan(input, inclusive_output, scan_op);
-
-        // Retrieve aggregate
-        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output);
-
-        WARP_SYNC(member_mask);
-
-        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
-
-        WARP_SYNC(member_mask);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Get exclusive from inclusive
-    //---------------------------------------------------------------------
-
-    /// Update inclusive and exclusive using input and inclusive
-    template <typename ScanOpT, typename IsIntegerT>
-    __device__ __forceinline__ void Update(
-        T                       /*input*/,      ///< [in]
-        T                       &inclusive,     ///< [in, out]
-        T                       &exclusive,     ///< [out]
-        ScanOpT                 /*scan_op*/,    ///< [in]
-        IsIntegerT              /*is_integer*/) ///< [in]
-    {
-        // initial value unknown
-        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
-
-        WARP_SYNC(member_mask);
-
-        exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
-    }
-
-    /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types)
-    __device__ __forceinline__ void Update(
-        T                       input,
-        T                       &inclusive,
-        T                       &exclusive,
-        cub::Sum                /*scan_op*/,
-        Int2Type<true>          /*is_integer*/)
-    {
-        // initial value presumed 0
-        exclusive = inclusive - input;
-    }
-
-    /// Update inclusive and exclusive using initial value using input, inclusive, and initial value
-    template <typename ScanOpT, typename IsIntegerT>
-    __device__ __forceinline__ void Update (
-        T                       /*input*/,
-        T                       &inclusive,
-        T                       &exclusive,
-        ScanOpT                 scan_op,
-        T                       initial_value,
-        IsIntegerT              /*is_integer*/)
-    {
-        inclusive = scan_op(initial_value, inclusive);
-        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
-
-        WARP_SYNC(member_mask);
-
-        exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
-        if (lane_id == 0)
-            exclusive = initial_value;
-    }
-
-    /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types)
-    __device__ __forceinline__ void Update (
-        T                       input,
-        T                       &inclusive,
-        T                       &exclusive,
-        cub::Sum                scan_op,
-        T                       initial_value,
-        Int2Type<true>          /*is_integer*/)
-    {
-        inclusive = scan_op(initial_value, inclusive);
-        exclusive = inclusive - input;
-    }
-
-
-    /// Update inclusive, exclusive, and warp aggregate using input and inclusive
-    template <typename ScanOpT, typename IsIntegerT>
-    __device__ __forceinline__ void Update (
-        T                       /*input*/,
-        T                       &inclusive,
-        T                       &exclusive,
-        T                       &warp_aggregate,
-        ScanOpT                 /*scan_op*/,
-        IsIntegerT              /*is_integer*/)
-    {
-        // Initial value presumed to be unknown or identity (either way our padding is correct)
-        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
-
-        WARP_SYNC(member_mask);
-
-        exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
-        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
-    }
-
-    /// Update inclusive, exclusive, and warp aggregate using input and inclusive (specialized for summation of integer types)
-    __device__ __forceinline__ void Update (
-        T                       input,
-        T                       &inclusive,
-        T                       &exclusive,
-        T                       &warp_aggregate,
-        cub::Sum                /*scan_o*/,
-        Int2Type<true>          /*is_integer*/)
-    {
-        // Initial value presumed to be unknown or identity (either way our padding is correct)
-        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
-
-        WARP_SYNC(member_mask);
-
-        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
-        exclusive = inclusive - input;
-    }
-
-    /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value
-    template <typename ScanOpT, typename IsIntegerT>
-    __device__ __forceinline__ void Update (
-        T                       /*input*/,
-        T                       &inclusive,
-        T                       &exclusive,
-        T                       &warp_aggregate,
-        ScanOpT                 scan_op,
-        T                       initial_value,
-        IsIntegerT              /*is_integer*/)
-    {
-        // Broadcast warp aggregate
-        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
-
-        WARP_SYNC(member_mask);
-
-        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
-
-        WARP_SYNC(member_mask);
-
-        // Update inclusive with initial value
-        inclusive = scan_op(initial_value, inclusive);
-
-        // Get exclusive from exclusive
-        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1], (CellT) inclusive);
-
-        WARP_SYNC(member_mask);
-
-        exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 2]);
-
-        if (lane_id == 0)
-            exclusive = initial_value;
-    }
-
-
-};
-
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh b/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh
deleted file mode 100644
index baef93594..000000000
--- a/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh
+++ /dev/null
@@ -1,612 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp.
- */
-
-#pragma once
-
-#include "specializations/warp_reduce_shfl.cuh"
-#include "specializations/warp_reduce_smem.cuh"
-#include "../thread/thread_operators.cuh"
-#include "../util_arch.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup WarpModule
- * @{
- */
-
-/**
- * \brief The WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. ![](warp_reduce_logo.png)
- *
- * \tparam T                        The reduction input/output element type
- * \tparam LOGICAL_WARP_THREADS     <b>[optional]</b> The number of threads per "logical" warp (may be less than the number of hardware warp threads).  Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20).
- * \tparam PTX_ARCH                 <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
- *   uses a binary combining operator to compute a single aggregate from a list of input elements.
- * - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads)
- * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS
- *
- * \par Performance Considerations
- * - Uses special instructions when applicable (e.g., warp \p SHFL instructions)
- * - Uses synchronization-free communication between warp lanes when applicable
- * - Incurs zero bank conflicts for most types
- * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
- *     - Summation (<b><em>vs.</em></b> generic reduction)
- *     - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
- *
- * \par Simple Examples
- * \warpcollective{WarpReduce}
- * \par
- * The code snippet below illustrates four concurrent warp sum reductions within a block of
- * 128 threads (one per each of the 32-thread warps).
- * \par
- * \code
- * #include <cub/cub.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize WarpReduce for type int
- *     typedef cub::WarpReduce<int> WarpReduce;
- *
- *     // Allocate WarpReduce shared memory for 4 warps
- *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
- *
- *     // Obtain one input item per thread
- *     int thread_data = ...
- *
- *     // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96)
- *     int warp_id = threadIdx.x / 32;
- *     int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
- * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520,
- * \p 2544, and \p 3568, respectively (and is undefined in other threads).
- *
- * \par
- * The code snippet below illustrates a single warp sum reduction within a block of
- * 128 threads.
- * \par
- * \code
- * #include <cub/cub.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize WarpReduce for type int
- *     typedef cub::WarpReduce<int> WarpReduce;
- *
- *     // Allocate WarpReduce shared memory for one warp
- *     __shared__ typename WarpReduce::TempStorage temp_storage;
- *     ...
- *
- *     // Only the first warp performs a reduction
- *     if (threadIdx.x < 32)
- *     {
- *         // Obtain one input item per thread
- *         int thread_data = ...
- *
- *         // Return the warp-wide sum to lane0
- *         int aggregate = WarpReduce(temp_storage).Sum(thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the warp of threads is <tt>{0, 1, 2, 3, ..., 31}</tt>.
- * The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads).
- *
- */
-template <
-    typename    T,
-    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
-    int         PTX_ARCH                = CUB_PTX_ARCH>
-class WarpReduce
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    enum
-    {
-        /// Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// Whether the logical warp size is a power-of-two
-        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
-    };
-
-public:
-
-    #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-    /// Internal specialization.  Use SHFL-based reduction if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two)
-    typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
-        WarpReduceShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>,
-        WarpReduceSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpReduce;
-
-    #endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-private:
-
-    /// Shared memory storage layout type for WarpReduce
-    typedef typename InternalWarpReduce::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-public:
-
-    /// \smemstorage{WarpReduce}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.  Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
-     */
-    __device__ __forceinline__ WarpReduce(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias())
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Summation reductions
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes a warp-wide sum in the calling warp.  The output is valid in warp <em>lane</em><sub>0</sub>.
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp sum reductions within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for 4 warps
-     *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Return the warp-wide sums to each lane0
-     *     int warp_id = threadIdx.x / 32;
-     *     int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
-     * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520,
-     * \p 2544, and \p 3568, respectively (and is undefined in other threads).
-     *
-     */
-    __device__ __forceinline__ T Sum(
-        T                   input)              ///< [in] Calling thread's input
-    {
-        return InternalWarpReduce(temp_storage).template Reduce<true, 1>(input, LOGICAL_WARP_THREADS, cub::Sum());
-    }
-
-    /**
-     * \brief Computes a partially-full warp-wide sum in the calling warp.  The output is valid in warp <em>lane</em><sub>0</sub>.
-     *
-     * All threads across the calling warp must agree on the same value for \p valid_items.  Otherwise the result is undefined.
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sum reduction within a single, partially-full
-     * block of 32 threads (one warp).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, int valid_items)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for one warp
-     *     __shared__ typename WarpReduce::TempStorage temp_storage;
-     *
-     *     // Obtain one input item per thread if in range
-     *     int thread_data;
-     *     if (threadIdx.x < valid_items)
-     *         thread_data = d_data[threadIdx.x];
-     *
-     *     // Return the warp-wide sums to each lane0
-     *     int aggregate = WarpReduce(temp_storage).Sum(
-     *         thread_data, valid_items);
-     *
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>{0, 1, 2, 3, 4, ...</tt> and \p valid_items
-     * is \p 4.  The corresponding output \p aggregate in thread0 is \p 6 (and is
-     * undefined in other threads).
-     *
-     */
-    __device__ __forceinline__ T Sum(
-        T                   input,              ///< [in] Calling thread's input
-        int                 valid_items)        ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
-    {
-        // Determine if we don't need bounds checking
-        return InternalWarpReduce(temp_storage).template Reduce<false, 1>(input, valid_items, cub::Sum());
-    }
-
-
-    /**
-     * \brief Computes a segmented sum in the calling warp where segments are defined by head-flags.  The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a head-segmented warp sum
-     * reduction within a block of 32 threads (one warp).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for one warp
-     *     __shared__ typename WarpReduce::TempStorage temp_storage;
-     *
-     *     // Obtain one input item and flag per thread
-     *     int thread_data = ...
-     *     int head_flag = ...
-     *
-     *     // Return the warp-wide sums to each lane0
-     *     int aggregate = WarpReduce(temp_storage).HeadSegmentedSum(
-     *         thread_data, head_flag);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data and \p head_flag across the block of threads
-     * is <tt>{0, 1, 2, 3, ..., 31</tt> and is <tt>{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0</tt>,
-     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
-     * \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
-     *
-     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     *
-     */
-    template <
-        typename            FlagT>
-    __device__ __forceinline__ T HeadSegmentedSum(
-        T                   input,              ///< [in] Calling thread's input
-        FlagT                head_flag)          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
-    {
-        return HeadSegmentedReduce(input, head_flag, cub::Sum());
-    }
-
-
-    /**
-     * \brief Computes a segmented sum in the calling warp where segments are defined by tail-flags.  The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a tail-segmented warp sum
-     * reduction within a block of 32 threads (one warp).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for one warp
-     *     __shared__ typename WarpReduce::TempStorage temp_storage;
-     *
-     *     // Obtain one input item and flag per thread
-     *     int thread_data = ...
-     *     int tail_flag = ...
-     *
-     *     // Return the warp-wide sums to each lane0
-     *     int aggregate = WarpReduce(temp_storage).TailSegmentedSum(
-     *         thread_data, tail_flag);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data and \p tail_flag across the block of threads
-     * is <tt>{0, 1, 2, 3, ..., 31</tt> and is <tt>{0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1</tt>,
-     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
-     * \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
-     *
-     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        typename            FlagT>
-    __device__ __forceinline__ T TailSegmentedSum(
-        T                   input,              ///< [in] Calling thread's input
-        FlagT                tail_flag)          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
-    {
-        return TailSegmentedReduce(input, tail_flag, cub::Sum());
-    }
-
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Generic reductions
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Computes a warp-wide reduction in the calling warp using the specified binary reduction functor.  The output is valid in warp <em>lane</em><sub>0</sub>.
-     *
-     * Supports non-commutative reduction operators
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp max reductions within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for 4 warps
-     *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Return the warp-wide reductions to each lane0
-     *     int warp_id = threadIdx.x / 32;
-     *     int aggregate = WarpReduce(temp_storage[warp_id]).Reduce(
-     *         thread_data, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
-     * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 31, \p 63,
-     * \p 95, and \p 127, respectively  (and is undefined in other threads).
-     *
-     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T                   input,              ///< [in] Calling thread's input
-        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
-    {
-        return InternalWarpReduce(temp_storage).template Reduce<true, 1>(input, LOGICAL_WARP_THREADS, reduction_op);
-    }
-
-    /**
-     * \brief Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor.  The output is valid in warp <em>lane</em><sub>0</sub>.
-     *
-     * All threads across the calling warp must agree on the same value for \p valid_items.  Otherwise the result is undefined.
-     *
-     * Supports non-commutative reduction operators
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a max reduction within a single, partially-full
-     * block of 32 threads (one warp).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, int valid_items)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for one warp
-     *     __shared__ typename WarpReduce::TempStorage temp_storage;
-     *
-     *     // Obtain one input item per thread if in range
-     *     int thread_data;
-     *     if (threadIdx.x < valid_items)
-     *         thread_data = d_data[threadIdx.x];
-     *
-     *     // Return the warp-wide reductions to each lane0
-     *     int aggregate = WarpReduce(temp_storage).Reduce(
-     *         thread_data, cub::Max(), valid_items);
-     *
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>{0, 1, 2, 3, 4, ...</tt> and \p valid_items
-     * is \p 4.  The corresponding output \p aggregate in thread0 is \p 3 (and is
-     * undefined in other threads).
-     *
-     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T                   input,              ///< [in] Calling thread's input
-        ReductionOp         reduction_op,       ///< [in] Binary reduction operator
-        int                 valid_items)        ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
-    {
-        return InternalWarpReduce(temp_storage).template Reduce<false, 1>(input, valid_items, reduction_op);
-    }
-
-
-    /**
-     * \brief Computes a segmented reduction in the calling warp where segments are defined by head-flags.  The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
-     *
-     * Supports non-commutative reduction operators
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a head-segmented warp max
-     * reduction within a block of 32 threads (one warp).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for one warp
-     *     __shared__ typename WarpReduce::TempStorage temp_storage;
-     *
-     *     // Obtain one input item and flag per thread
-     *     int thread_data = ...
-     *     int head_flag = ...
-     *
-     *     // Return the warp-wide reductions to each lane0
-     *     int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce(
-     *         thread_data, head_flag, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data and \p head_flag across the block of threads
-     * is <tt>{0, 1, 2, 3, ..., 31</tt> and is <tt>{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0</tt>,
-     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
-     * \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
-     *
-     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        typename            ReductionOp,
-        typename            FlagT>
-    __device__ __forceinline__ T HeadSegmentedReduce(
-        T                   input,              ///< [in] Calling thread's input
-        FlagT                head_flag,          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
-        ReductionOp         reduction_op)       ///< [in] Reduction operator
-    {
-        return InternalWarpReduce(temp_storage).template SegmentedReduce<true>(input, head_flag, reduction_op);
-    }
-
-
-    /**
-     * \brief Computes a segmented reduction in the calling warp where segments are defined by tail-flags.  The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
-     *
-     * Supports non-commutative reduction operators
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a tail-segmented warp max
-     * reduction within a block of 32 threads (one warp).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for one warp
-     *     __shared__ typename WarpReduce::TempStorage temp_storage;
-     *
-     *     // Obtain one input item and flag per thread
-     *     int thread_data = ...
-     *     int tail_flag = ...
-     *
-     *     // Return the warp-wide reductions to each lane0
-     *     int aggregate = WarpReduce(temp_storage).TailSegmentedReduce(
-     *         thread_data, tail_flag, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data and \p tail_flag across the block of threads
-     * is <tt>{0, 1, 2, 3, ..., 31</tt> and is <tt>{0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1</tt>,
-     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
-     * \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
-     *
-     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        typename            ReductionOp,
-        typename            FlagT>
-    __device__ __forceinline__ T TailSegmentedReduce(
-        T                   input,              ///< [in] Calling thread's input
-        FlagT                tail_flag,          ///< [in] Tail flag denoting whether or not \p input is the end of the current segment
-        ReductionOp         reduction_op)       ///< [in] Reduction operator
-    {
-        return InternalWarpReduce(temp_storage).template SegmentedReduce<false>(input, tail_flag, reduction_op);
-    }
-
-
-
-    //@}  end member group
-};
-
-/** @} */       // end group WarpModule
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/warp/warp_scan.cuh b/thrust/system/cuda/detail/cub/warp/warp_scan.cuh
deleted file mode 100644
index aa7149586..000000000
--- a/thrust/system/cuda/detail/cub/warp/warp_scan.cuh
+++ /dev/null
@@ -1,936 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::WarpScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp.
- */
-
-#pragma once
-
-#include "specializations/warp_scan_shfl.cuh"
-#include "specializations/warp_scan_smem.cuh"
-#include "../thread/thread_operators.cuh"
-#include "../util_arch.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-THRUST_CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup WarpModule
- * @{
- */
-
-/**
- * \brief The WarpScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp.  ![](warp_scan_logo.png)
- *
- * \tparam T                        The scan input/output element type
- * \tparam LOGICAL_WARP_THREADS     <b>[optional]</b> The number of threads per "logical" warp (may be less than the number of hardware warp threads).  Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20).
- * \tparam PTX_ARCH                 <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - Given a list of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
- *   produces an output list where each element is computed to be the reduction
- *   of the elements occurring earlier in the input list.  <em>Prefix sum</em>
- *   connotes a prefix scan with the addition operator. The term \em inclusive indicates
- *   that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
- *   The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
- *   the <em>i</em><sup>th</sup> output reduction.
- * - Supports non-commutative scan operators
- * - Supports "logical" warps smaller than the physical warp size (e.g., a logical warp of 8 threads)
- * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS
- *
- * \par Performance Considerations
- * - Uses special instructions when applicable (e.g., warp \p SHFL)
- * - Uses synchronization-free communication between warp lanes when applicable
- * - Incurs zero bank conflicts for most types
- * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
- *     - Summation (<b><em>vs.</em></b> generic scan)
- *     - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
- *
- * \par Simple Examples
- * \warpcollective{WarpScan}
- * \par
- * The code snippet below illustrates four concurrent warp prefix sums within a block of
- * 128 threads (one per each of the 32-thread warps).
- * \par
- * \code
- * #include <cub/cub.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize WarpScan for type int
- *     typedef cub::WarpScan<int> WarpScan;
- *
- *     // Allocate WarpScan shared memory for 4 warps
- *     __shared__ typename WarpScan::TempStorage temp_storage[4];
- *
- *     // Obtain one input item per thread
- *     int thread_data = ...
- *
- *     // Compute warp-wide prefix sums
- *     int warp_id = threadIdx.x / 32;
- *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
- * The corresponding output \p thread_data in each of the four warps of threads will be
- * <tt>0, 1, 2, 3, ..., 31}</tt>.
- *
- * \par
- * The code snippet below illustrates a single warp prefix sum within a block of
- * 128 threads.
- * \par
- * \code
- * #include <cub/cub.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize WarpScan for type int
- *     typedef cub::WarpScan<int> WarpScan;
- *
- *     // Allocate WarpScan shared memory for one warp
- *     __shared__ typename WarpScan::TempStorage temp_storage;
- *     ...
- *
- *     // Only the first warp performs a prefix sum
- *     if (threadIdx.x < 32)
- *     {
- *         // Obtain one input item per thread
- *         int thread_data = ...
- *
- *         // Compute warp-wide prefix sums
- *         WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the warp of threads is <tt>{1, 1, 1, 1, ...}</tt>.
- * The corresponding output \p thread_data will be <tt>{0, 1, 2, 3, ..., 31}</tt>.
- *
- */
-template <
-    typename    T,
-    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
-    int         PTX_ARCH                = CUB_PTX_ARCH>
-class WarpScan
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    enum
-    {
-        /// Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// Whether the logical warp size is a power-of-two
-        IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
-
-        /// Whether the data type is an integer (which has fully-associative addition)
-        IS_INTEGER = ((Traits<T>::CATEGORY == SIGNED_INTEGER) || (Traits<T>::CATEGORY == UNSIGNED_INTEGER))
-    };
-
-    /// Internal specialization.  Use SHFL-based scan if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two)
-    typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
-        WarpScanShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>,
-        WarpScanSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpScan;
-
-    /// Shared memory storage layout type for WarpScan
-    typedef typename InternalWarpScan::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage    &temp_storage;
-    unsigned int    lane_id;
-
-
-
-    /******************************************************************************
-     * Public types
-     ******************************************************************************/
-
-public:
-
-    /// \smemstorage{WarpScan}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.  Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
-     */
-    __device__ __forceinline__ WarpScan(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        lane_id(IS_ARCH_WARP ?
-            LaneId() :
-            LaneId() % LOGICAL_WARP_THREADS)
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Inclusive prefix sums
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an inclusive prefix sum across the calling warp.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute inclusive warp-wide prefix sums
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
-     * The corresponding output \p thread_data in each of the four warps of threads will be
-     * <tt>1, 2, 3, ..., 32}</tt>.
-     */
-    __device__ __forceinline__ void InclusiveSum(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output)  ///< [out] Calling thread's output item.  May be aliased with \p input.
-    {
-        InclusiveScan(input, inclusive_output, cub::Sum());
-    }
-
-
-    /**
-     * \brief Computes an inclusive prefix sum across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute inclusive warp-wide prefix sums
-     *     int warp_aggregate;
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
-     * The corresponding output \p thread_data in each of the four warps of threads will be
-     * <tt>1, 2, 3, ..., 32}</tt>.  Furthermore, \p warp_aggregate for all threads in all warps will be \p 32.
-     */
-    __device__ __forceinline__ void InclusiveSum(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InclusiveScan(input, inclusive_output, cub::Sum(), warp_aggregate);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Exclusive prefix sums
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive prefix sum across the calling warp.  The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in <em>thread</em><sub>0</sub>.
-     *
-     * \par
-     *  - \identityzero
-     *  - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix sums
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
-     * The corresponding output \p thread_data in each of the four warps of threads will be
-     * <tt>0, 1, 2, ..., 31}</tt>.
-     *
-     */
-    __device__ __forceinline__ void ExclusiveSum(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &exclusive_output)  ///< [out] Calling thread's output item.  May be aliased with \p input.
-    {
-        T initial_value = 0;
-        ExclusiveScan(input, exclusive_output, initial_value, cub::Sum());
-    }
-
-
-    /**
-     * \brief Computes an exclusive prefix sum across the calling warp.  The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in <em>thread</em><sub>0</sub>.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * \par
-     *  - \identityzero
-     *  - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix sums
-     *     int warp_aggregate;
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data, warp_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
-     * The corresponding output \p thread_data in each of the four warps of threads will be
-     * <tt>0, 1, 2, ..., 31}</tt>.  Furthermore, \p warp_aggregate for all threads in all warps will be \p 32.
-     */
-    __device__ __forceinline__ void ExclusiveSum(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &exclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        T initial_value = 0;
-        ExclusiveScan(input, exclusive_output, initial_value, cub::Sum(), warp_aggregate);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Inclusive prefix scans
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.
-     *
-     * \par
-     *  - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute inclusive warp-wide prefix max scans
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p thread_data in the first warp would be
-     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op);
-    }
-
-
-    /**
-     * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute inclusive warp-wide prefix max scans
-     *     int warp_aggregate;
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).InclusiveScan(
-     *         thread_data, thread_data, cub::Max(), warp_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p thread_data in the first warp would be
-     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
-     * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
-     * in the second warp, etc.
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op, warp_aggregate);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Exclusive prefix scans
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Because no initial value is supplied, the \p output computed for <em>warp-lane</em><sub>0</sub> is undefined.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix max scans
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p thread_data in the first warp would be
-     * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
-     * (The output \p thread_data in warp lane<sub>0</sub> is undefined.)
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &exclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        InternalWarpScan internal(temp_storage);
-
-        T inclusive_output;
-        internal.InclusiveScan(input, inclusive_output, scan_op);
-
-        internal.Update(
-            input,
-            inclusive_output,
-            exclusive_output,
-            scan_op,
-            Int2Type<IS_INTEGER>());
-    }
-
-
-    /**
-     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix max scans
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p thread_data in the first warp would be
-     * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc.
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &exclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T               initial_value,      ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        InternalWarpScan internal(temp_storage);
-
-        T inclusive_output;
-        internal.InclusiveScan(input, inclusive_output, scan_op);
-
-        internal.Update(
-            input,
-            inclusive_output,
-            exclusive_output,
-            scan_op,
-            initial_value,
-            Int2Type<IS_INTEGER>());
-    }
-
-
-    /**
-     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Because no initial value is supplied, the \p output computed for <em>warp-lane</em><sub>0</sub> is undefined.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix max scans
-     *     int warp_aggregate;
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max(), warp_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p thread_data in the first warp would be
-     * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
-     * (The output \p thread_data in warp lane<sub>0</sub> is undefined.)  Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
-     * in the second warp, etc.
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &exclusive_output,   ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InternalWarpScan internal(temp_storage);
-
-        T inclusive_output;
-        internal.InclusiveScan(input, inclusive_output, scan_op);
-
-        internal.Update(
-            input,
-            inclusive_output,
-            exclusive_output,
-            warp_aggregate,
-            scan_op,
-            Int2Type<IS_INTEGER>());
-    }
-
-
-    /**
-     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix max scans
-     *     int warp_aggregate;
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p thread_data in the first warp would be
-     * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc.
-     * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
-     * in the second warp, etc.
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &exclusive_output,  ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T               initial_value,      ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InternalWarpScan internal(temp_storage);
-
-        T inclusive_output;
-        internal.InclusiveScan(input, inclusive_output, scan_op);
-
-        internal.Update(
-            input,
-            inclusive_output,
-            exclusive_output,
-            warp_aggregate,
-            scan_op,
-            initial_value,
-            Int2Type<IS_INTEGER>());
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Combination (inclusive & exclusive) prefix scans
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp.  Because no initial value is supplied, the \p exclusive_output computed for <em>warp-lane</em><sub>0</sub> is undefined.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix max scans
-     *     int inclusive_partial, exclusive_partial;
-     *     WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p inclusive_partial in the first warp would be
-     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
-     * The corresponding output \p exclusive_partial in the first warp would be
-     * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
-     * (The output \p thread_data in warp lane<sub>0</sub> is undefined.)
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void Scan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
-        T               &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        InternalWarpScan internal(temp_storage);
-
-        internal.InclusiveScan(input, inclusive_output, scan_op);
-
-        internal.Update(
-            input,
-            inclusive_output,
-            exclusive_output,
-            scan_op,
-            Int2Type<IS_INTEGER>());
-    }
-
-
-    /**
-     * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp.
-     *
-     * \par
-     *  - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute inclusive warp-wide prefix max scans
-     *     int warp_id = threadIdx.x / 32;
-     *     int inclusive_partial, exclusive_partial;
-     *     WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, INT_MIN, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p inclusive_partial in the first warp would be
-     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
-     * The corresponding output \p exclusive_partial in the first warp would be
-     * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc.
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void Scan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
-        T               &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
-        T               initial_value,      ///< [in] Initial value to seed the exclusive scan
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        InternalWarpScan internal(temp_storage);
-
-        internal.InclusiveScan(input, inclusive_output, scan_op);
-
-        internal.Update(
-            input,
-            inclusive_output,
-            exclusive_output,
-            scan_op,
-            initial_value,
-            Int2Type<IS_INTEGER>());
-    }
-
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Data exchange
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Broadcast the value \p input from <em>warp-lane</em><sub><tt>src_lane</tt></sub> to all lanes in the warp
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the warp-wide broadcasts of values from
-     * lanes<sub>0</sub> in each of four warps to all other threads in those warps.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Broadcast from lane0 in each warp to all other threads in the warp
-     *     int warp_id = threadIdx.x / 32;
-     *     thread_data = WarpScan(temp_storage[warp_id]).Broadcast(thread_data, 0);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
-     * The corresponding output \p thread_data will be
-     * <tt>{0, 0, ..., 0}</tt> in warp<sub>0</sub>,
-     * <tt>{32, 32, ..., 32}</tt> in warp<sub>1</sub>,
-     * <tt>{64, 64, ..., 64}</tt> in warp<sub>2</sub>, etc.
-     */
-    __device__ __forceinline__ T Broadcast(
-        T               input,              ///< [in] The value to broadcast
-        unsigned int    src_lane)           ///< [in] Which warp lane is to do the broadcasting
-    {
-        return InternalWarpScan(temp_storage).Broadcast(input, src_lane);
-    }
-
-    //@}  end member group
-
-};
-
-/** @} */       // end group WarpModule
-
-}               // CUB namespace
-THRUST_CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/dispatch.h b/thrust/system/cuda/detail/dispatch.h
new file mode 100644
index 000000000..d0e3f94ec
--- /dev/null
+++ b/thrust/system/cuda/detail/dispatch.h
@@ -0,0 +1,79 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/preprocessor.h>
+#include <thrust/detail/integer_traits.h>
+
+/**
+ * Dispatch between 32-bit and 64-bit index based versions of the same algorithm
+ * implementation. This version assumes that callables for both branches consist
+ * of the same tokens, and is intended to be used with Thrust-style dispatch
+ * interfaces, that always deduce the size type from the arguments.
+ */
+#define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \
+    if (count <= thrust::detail::integer_traits<thrust::detail::int32_t>::const_max) { \
+        auto THRUST_PP_CAT2(count, _fixed) = static_cast<thrust::detail::int32_t>(count); \
+        status = call arguments; \
+    } \
+    else { \
+        auto THRUST_PP_CAT2(count, _fixed) = static_cast<thrust::detail::int64_t>(count); \
+        status = call arguments; \
+    }
+
+/**
+ * Dispatch between 32-bit and 64-bit index based versions of the same algorithm
+ * implementation. This version assumes that callables for both branches consist
+ * of the same tokens, and is intended to be used with Thrust-style dispatch
+ * interfaces, that always deduce the size type from the arguments.
+ *
+ * This version of the macro supports providing two count variables, which is
+ * necessary for set algorithms.
+ */
+#define THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, call, count1, count2, arguments) \
+    if (count1 + count2 <= thrust::detail::integer_traits<thrust::detail::int32_t>::const_max) { \
+        auto THRUST_PP_CAT2(count1, _fixed) = static_cast<thrust::detail::int32_t>(count1); \
+        auto THRUST_PP_CAT2(count2, _fixed) = static_cast<thrust::detail::int32_t>(count2); \
+        status = call arguments; \
+    } \
+    else { \
+        auto THRUST_PP_CAT2(count1, _fixed) = static_cast<thrust::detail::int64_t>(count1); \
+        auto THRUST_PP_CAT2(count2, _fixed) = static_cast<thrust::detail::int64_t>(count2); \
+        status = call arguments; \
+    }
+/**
+ * Dispatch between 32-bit and 64-bit index based versions of the same algorithm
+ * implementation. This version allows using different token sequences for callables
+ * in both branches, and is intended to be used with CUB-style dispatch interfaces,
+ * where the "simple" interface always forces the size to be `int` (making it harder
+ * for us to use), but the complex interface that we end up using doesn't actually
+ * provide a way to fully deduce the type from just the call, making the size type
+ * appear in the token sequence of the callable.
+ *
+ * See reduce_n_impl to see an example of how this is meant to be used.
+ */
+#define THRUST_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count, arguments) \
+    if (count <= thrust::detail::integer_traits<thrust::detail::int32_t>::const_max) { \
+        auto THRUST_PP_CAT2(count, _fixed) = static_cast<thrust::detail::int32_t>(count); \
+        status = call_32 arguments; \
+    } \
+    else { \
+        auto THRUST_PP_CAT2(count, _fixed) = static_cast<thrust::detail::int64_t>(count); \
+        status = call_64 arguments; \
+    }
+
diff --git a/thrust/system/cuda/detail/equal.h b/thrust/system/cuda/detail/equal.h
index 7a995cffd..aec608245 100644
--- a/thrust/system/cuda/detail/equal.h
+++ b/thrust/system/cuda/detail/equal.h
@@ -26,13 +26,14 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <thrust/system/cuda/config.h>
 
 #include <thrust/system/cuda/detail/mismatch.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 template <class Derived,
@@ -69,5 +70,5 @@ equal(execution_policy<Derived>& policy,
 
 
 } // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/error.inl b/thrust/system/cuda/detail/error.inl
index 5c689b499..e52305211 100644
--- a/thrust/system/cuda/detail/error.inl
+++ b/thrust/system/cuda/detail/error.inl
@@ -17,11 +17,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/system/cuda/error.h>
 #include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -71,7 +72,7 @@ class cuda_error_category
     {
       using namespace cuda::errc;
 
-      if(ev < ::cudaErrorApiFailureBase)
+      if(ev < ::cudaErrorUnknown)
       {
         return make_error_condition(static_cast<errc_t>(ev));
       }
@@ -94,5 +95,5 @@ const error_category &cuda_category(void)
 
 } // end namespace system
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cuda/detail/execution_policy.h b/thrust/system/cuda/detail/execution_policy.h
index 0b3af62e3..4202424c5 100644
--- a/thrust/system/cuda/detail/execution_policy.h
+++ b/thrust/system/cuda/detail/execution_policy.h
@@ -27,6 +27,8 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/version.h>
 #include <thrust/detail/execution_policy.h>
 #include <thrust/iterator/detail/any_system_tag.h>
@@ -38,7 +40,7 @@
   #include <thrust/detail/dependencies_aware_execution_policy.h>
 #endif
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace cuda_cub
 {
@@ -94,5 +96,5 @@ using thrust::cuda_cub::execution_policy;
 
 } // namespace cuda
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cuda/detail/extrema.h b/thrust/system/cuda/detail/extrema.h
index 863700ad9..4fe7ec86b 100644
--- a/thrust/system/cuda/detail/extrema.h
+++ b/thrust/system/cuda/detail/extrema.h
@@ -26,18 +26,22 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-#include <thrust/system/cuda/config.h>
-#include <thrust/system/cuda/detail/reduce.h>
 
 #include <thrust/detail/cstdint.h>
 #include <thrust/detail/temporary_array.h>
+#include <thrust/distance.h>
 #include <thrust/extrema.h>
 #include <thrust/pair.h>
-#include <thrust/distance.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/reduce.h>
 
-THRUST_BEGIN_NS
+#include <cub/util_math.cuh>
+
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 namespace __extrema {
@@ -108,7 +112,7 @@ namespace __extrema {
   struct arg_minmax_f
   {
     Predicate predicate;
-    
+
     typedef tuple<InputType, IndexType> pair_type;
     typedef tuple<pair_type, pair_type> two_pairs_type;
 
@@ -127,8 +131,11 @@ namespace __extrema {
       pair_type const &lhs_min = get<0>(lhs);
       pair_type const &rhs_max = get<1>(rhs);
       pair_type const &lhs_max = get<1>(lhs);
-      return make_tuple(arg_min_t(predicate)(lhs_min, rhs_min),
-                        arg_max_t(predicate)(lhs_max, rhs_max));
+
+      auto result = thrust::make_tuple(arg_min_t(predicate)(lhs_min, rhs_min),
+                                       arg_max_t(predicate)(lhs_max, rhs_max));
+
+      return result;
     }
 
     struct duplicate_tuple
@@ -153,14 +160,15 @@ namespace __extrema {
             Size         num_items,
             ReductionOp  reduction_op,
             OutputIt     output_it,
-            cudaStream_t stream,
-            bool         debug_sync)
+            cudaStream_t stream)
   {
     using core::AgentPlan;
     using core::AgentLauncher;
     using core::get_agent_plan;
     using core::cuda_optional;
 
+    typedef typename detail::make_unsigned_special<Size>::type UnsignedSize;
+
     if (num_items == 0)
       return cudaErrorNotSupported;
 
@@ -185,7 +193,7 @@ namespace __extrema {
       }
       char *vshmem_ptr = vshmem_size > 0 ? (char*)d_temp_storage : NULL;
 
-      reduce_agent ra(reduce_plan, num_items, stream, vshmem_ptr, "reduce_agent: single_tile only", debug_sync);
+      reduce_agent ra(reduce_plan, num_items, stream, vshmem_ptr, "reduce_agent: single_tile only");
       ra.launch(input_it, output_it, num_items, reduction_op);
       CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
     }
@@ -195,16 +203,14 @@ namespace __extrema {
       cuda_optional<int> sm_count = core::get_sm_count();
       CUDA_CUB_RET_IF_FAIL(sm_count.status());
 
-      typedef __reduce::GridSizeType GridSizeType;
-
       // reduction will not use more cta counts than requested
       cuda_optional<int> max_blocks_per_sm =
           reduce_agent::
               template get_max_blocks_per_sm<InputIt,
                                              OutputIt,
                                              Size,
-                                             cub::GridEvenShare<GridSizeType>,
-                                             cub::GridQueue<GridSizeType>,
+                                             cub::GridEvenShare<Size>,
+                                             cub::GridQueue<UnsignedSize>,
                                              ReductionOp>(reduce_plan);
       CUDA_CUB_RET_IF_FAIL(max_blocks_per_sm.status());
 
@@ -215,8 +221,8 @@ namespace __extrema {
       int sm_oversubscription = 5;
       int max_blocks          = reduce_device_occupancy * sm_oversubscription;
 
-      cub::GridEvenShare<GridSizeType> even_share;
-      even_share.DispatchInit(static_cast<int>(num_items), max_blocks,
+      cub::GridEvenShare<Size> even_share;
+      even_share.DispatchInit(num_items, max_blocks,
                               reduce_plan.items_per_tile);
 
       // we will launch at most "max_blocks" blocks in a grid
@@ -230,7 +236,7 @@ namespace __extrema {
       size_t allocation_sizes[3] =
           {
               max_blocks * sizeof(T),                            // bytes needed for privatized block reductions
-              cub::GridQueue<GridSizeType>::AllocationSize(),    // bytes needed for grid queue descriptor0
+              cub::GridQueue<UnsignedSize>::AllocationSize(),    // bytes needed for grid queue descriptor0
               vshmem_size                                        // size of virtualized shared memory storage
           };
       status = cub::AliasTemporaries(d_temp_storage,
@@ -244,7 +250,7 @@ namespace __extrema {
       }
 
       T *d_block_reductions = (T*) allocations[0];
-      cub::GridQueue<GridSizeType> queue(allocations[1]);
+      cub::GridQueue<UnsignedSize> queue(allocations[1]);
       char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[2] : NULL;
 
 
@@ -258,17 +264,16 @@ namespace __extrema {
       else if (reduce_plan.grid_mapping == cub::GRID_MAPPING_DYNAMIC)
       {
         // Work is distributed dynamically
-        size_t num_tiles = (num_items + reduce_plan.items_per_tile - 1) /
-          reduce_plan.items_per_tile;
+        size_t num_tiles = cub::DivideAndRoundUp(num_items, reduce_plan.items_per_tile);
 
         // if not enough to fill the device with threadblocks
         // then fill the device with threadblocks
-        reduce_grid_size = static_cast<int>(min(num_tiles, static_cast<size_t>(reduce_device_occupancy)));
+        reduce_grid_size = static_cast<int>((min)(num_tiles, static_cast<size_t>(reduce_device_occupancy)));
 
         typedef AgentLauncher<__reduce::DrainAgent<Size> > drain_agent;
         AgentPlan drain_plan = drain_agent::get_plan();
         drain_plan.grid_size = 1;
-        drain_agent da(drain_plan, stream, "__reduce::drain_agent", debug_sync);
+        drain_agent da(drain_plan, stream, "__reduce::drain_agent");
         da.launch(queue, num_items);
         CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
       }
@@ -278,7 +283,7 @@ namespace __extrema {
       }
 
       reduce_plan.grid_size = reduce_grid_size;
-      reduce_agent ra(reduce_plan, stream, vshmem_ptr, "reduce_agent: regular size reduce", debug_sync);
+      reduce_agent ra(reduce_plan, stream, vshmem_ptr, "reduce_agent: regular size reduce");
       ra.launch(input_it,
                 d_block_reductions,
                 num_items,
@@ -293,7 +298,7 @@ namespace __extrema {
         reduce_agent_single;
 
       reduce_plan.grid_size = 1;
-      reduce_agent_single ra1(reduce_plan, stream, vshmem_ptr, "reduce_agent: single tile reduce", debug_sync);
+      reduce_agent_single ra1(reduce_plan, stream, vshmem_ptr, "reduce_agent: single tile reduce");
 
       ra1.launch(d_block_reductions, output_it, reduce_grid_size, reduction_op);
       CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
@@ -318,17 +323,11 @@ namespace __extrema {
   {
     size_t       temp_storage_bytes = 0;
     cudaStream_t stream             = cuda_cub::stream(policy);
-    bool         debug_sync         = THRUST_DEBUG_SYNC_FLAG;
 
     cudaError_t status;
-    status = doit_step<T>(NULL,
-                          temp_storage_bytes,
-                          first,
-                          num_items,
-                          binary_op,
-                          reinterpret_cast<T*>(NULL),
-                          stream,
-                          debug_sync);
+    THRUST_INDEX_TYPE_DISPATCH(status, doit_step<T>, num_items,
+        (NULL, temp_storage_bytes, first, num_items_fixed,
+            binary_op, reinterpret_cast<T*>(NULL), stream));
     cuda_cub::throw_on_error(status, "extrema failed on 1st step");
 
     size_t allocation_sizes[2] = {sizeof(T*), temp_storage_bytes};
@@ -345,7 +344,7 @@ namespace __extrema {
     thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
       tmp(policy, storage_size);
     void *ptr = static_cast<void*>(tmp.data().get());
-    
+
     status = core::alias_storage(ptr,
                                  storage_size,
                                  allocations,
@@ -354,16 +353,11 @@ namespace __extrema {
 
     T* d_result = thrust::detail::aligned_reinterpret_cast<T*>(allocations[0]);
 
-    status = doit_step<T>(allocations[1],
-                          temp_storage_bytes,
-                          first,
-                          num_items,
-                          binary_op,
-                          d_result,
-                          stream,
-                          debug_sync);
+    THRUST_INDEX_TYPE_DISPATCH(status, doit_step<T>, num_items,
+        (allocations[1], temp_storage_bytes, first,
+            num_items_fixed, binary_op, d_result, stream));
     cuda_cub::throw_on_error(status, "extrema failed on 2nd step");
-    
+
     status = cuda_cub::synchronize(policy);
     cuda_cub::throw_on_error(status, "extrema failed to synchronize");
 
@@ -376,7 +370,7 @@ namespace __extrema {
             class Derived,
             class ItemsIt,
             class BinaryPred>
-  ItemsIt CUB_RUNTIME_FUNCTION
+  ItemsIt THRUST_RUNTIME_FUNCTION
   element(execution_policy<Derived> &policy,
           ItemsIt                    first,
           ItemsIt                    last,
@@ -393,7 +387,7 @@ namespace __extrema {
     typedef tuple<ItemsIt, counting_iterator_t<IndexType> > iterator_tuple;
     typedef zip_iterator<iterator_tuple> zip_iterator;
 
-    iterator_tuple iter_tuple = make_tuple(first, counting_iterator_t<IndexType>(0));
+    iterator_tuple iter_tuple = thrust::make_tuple(first, counting_iterator_t<IndexType>(0));
 
 
     typedef ArgFunctor<InputType, IndexType, BinaryPred> arg_min_t;
@@ -424,24 +418,16 @@ min_element(execution_policy<Derived> &policy,
             ItemsIt                    last,
             BinaryPred                 binary_pred)
 {
-  ItemsIt ret = first;
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __extrema::element<__extrema::arg_min_f>(policy,
-                                                   first,
-                                                   last,
-                                                   binary_pred);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::min_element(cvt_to_seq(derived_cast(policy)),
-                              first,
-                              last,
-                              binary_pred);
-#endif
-  }
-  return ret;
+  THRUST_CDP_DISPATCH(
+    (last = __extrema::element<__extrema::arg_min_f>(policy,
+                                                     first,
+                                                     last,
+                                                     binary_pred);),
+    (last = thrust::min_element(cvt_to_seq(derived_cast(policy)),
+                                first,
+                                last,
+                                binary_pred);));
+  return last;
 }
 
 template <class Derived,
@@ -467,24 +453,16 @@ max_element(execution_policy<Derived> &policy,
             ItemsIt                    last,
             BinaryPred                 binary_pred)
 {
-  ItemsIt ret = first;
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __extrema::element<__extrema::arg_max_f>(policy,
-                                                   first,
-                                                   last,
-                                                   binary_pred);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::max_element(cvt_to_seq(derived_cast(policy)),
-                              first,
-                              last,
-                              binary_pred);
-#endif
-  }
-  return ret;
+  THRUST_CDP_DISPATCH(
+    (last = __extrema::element<__extrema::arg_max_f>(policy,
+                                                     first,
+                                                     last,
+                                                     binary_pred);),
+    (last = thrust::max_element(cvt_to_seq(derived_cast(policy)),
+                                first,
+                                last,
+                                binary_pred);));
+  return last;
 }
 
 template <class Derived,
@@ -510,51 +488,46 @@ minmax_element(execution_policy<Derived> &policy,
                ItemsIt                    last,
                BinaryPred                 binary_pred)
 {
-  pair<ItemsIt, ItemsIt> ret = thrust::make_pair(first, first);
-
-  if (__THRUST_HAS_CUDART__)
+  auto ret = thrust::make_pair(last, last);
+  if (first == last)
   {
-    if (first == last)
-      return thrust::make_pair(last, last);
+    return ret;
+  }
 
-    typedef typename iterator_traits<ItemsIt>::value_type      InputType;
-    typedef typename iterator_traits<ItemsIt>::difference_type IndexType;
+  THRUST_CDP_DISPATCH(
+    (using InputType = typename iterator_traits<ItemsIt>::value_type;
+     using IndexType = typename iterator_traits<ItemsIt>::difference_type;
 
-    IndexType num_items = static_cast<IndexType>(thrust::distance(first, last));
+     const auto num_items =
+       static_cast<IndexType>(thrust::distance(first, last));
 
+     using iterator_tuple = tuple<ItemsIt, counting_iterator_t<IndexType>>;
+     using zip_iterator   = zip_iterator<iterator_tuple>;
 
-    typedef tuple<ItemsIt, counting_iterator_t<IndexType> > iterator_tuple;
-    typedef zip_iterator<iterator_tuple> zip_iterator;
+     iterator_tuple iter_tuple =
+       thrust::make_tuple(first, counting_iterator_t<IndexType>(0));
 
-    iterator_tuple iter_tuple = make_tuple(first, counting_iterator_t<IndexType>(0));
-
-
-    typedef __extrema::arg_minmax_f<InputType, IndexType, BinaryPred> arg_minmax_t;
-    typedef typename arg_minmax_t::two_pairs_type  two_pairs_type;
-    typedef typename arg_minmax_t::duplicate_tuple duplicate_t;
-    typedef transform_input_iterator_t<two_pairs_type,
-                                       zip_iterator,
-                                       duplicate_t>
-        transform_t;
-
-    zip_iterator   begin  = make_zip_iterator(iter_tuple);
-    two_pairs_type result = __extrema::extrema(policy,
-                                               transform_t(begin, duplicate_t()),
-                                               num_items,
-                                               arg_minmax_t(binary_pred),
-                                               (two_pairs_type *)(NULL));
-    ret = thrust::make_pair(first + get<1>(get<0>(result)),
-                    first + get<1>(get<1>(result)));
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::minmax_element(cvt_to_seq(derived_cast(policy)),
-                                 first,
-                                 last,
-                                 binary_pred);
-#endif
-  }
+     using arg_minmax_t =
+       __extrema::arg_minmax_f<InputType, IndexType, BinaryPred>;
+     using two_pairs_type = typename arg_minmax_t::two_pairs_type;
+     using duplicate_t    = typename arg_minmax_t::duplicate_tuple;
+     using transform_t =
+       transform_input_iterator_t<two_pairs_type, zip_iterator, duplicate_t>;
+
+     zip_iterator   begin = make_zip_iterator(iter_tuple);
+     two_pairs_type result =
+       __extrema::extrema(policy,
+                          transform_t(begin, duplicate_t()),
+                          num_items,
+                          arg_minmax_t(binary_pred),
+                          (two_pairs_type *)(NULL));
+     ret = thrust::make_pair(first + get<1>(get<0>(result)),
+                             first + get<1>(get<1>(result)));),
+    // CDP Sequential impl:
+    (ret = thrust::minmax_element(cvt_to_seq(derived_cast(policy)),
+                                  first,
+                                  last,
+                                  binary_pred);));
   return ret;
 }
 
@@ -571,5 +544,5 @@ minmax_element(execution_policy<Derived> &policy,
 
 
 } // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/fill.h b/thrust/system/cuda/detail/fill.h
index b5796f399..80ea68592 100644
--- a/thrust/system/cuda/detail/fill.h
+++ b/thrust/system/cuda/detail/fill.h
@@ -26,12 +26,14 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <thrust/system/cuda/detail/util.h>
 #include <thrust/system/cuda/detail/parallel_for.h>
 #include <thrust/distance.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 namespace __fill {
@@ -69,11 +71,6 @@ fill_n(execution_policy<Derived>& policy,
                          value),
                          count);
 
-  cuda_cub::throw_on_error(
-    cuda_cub::synchronize(policy)
-  , "fill_n: failed to synchronize"
-  );
-
   return first + count;
 }    // func fill_n
 
@@ -89,5 +86,5 @@ fill(execution_policy<Derived>& policy,
 
 
 } // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/find.h b/thrust/system/cuda/detail/find.h
index 0371c1cf8..b7d2b748f 100644
--- a/thrust/system/cuda/detail/find.h
+++ b/thrust/system/cuda/detail/find.h
@@ -26,6 +26,7 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <thrust/system/cuda/config.h>
@@ -34,7 +35,7 @@
 #include <thrust/detail/minmax.h>
 #include <thrust/distance.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 // XXX forward declare to circumvent circular depedency
@@ -66,12 +67,12 @@ find(execution_policy<Derived> &policy,
      T const& value);
 
 }; // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #include <thrust/system/cuda/detail/reduce.h>
 #include <thrust/iterator/zip_iterator.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 namespace __find_if {
@@ -203,13 +204,15 @@ find(execution_policy<Derived> &policy,
      InputIt                    last,
      T const& value)
 {
+  using thrust::placeholders::_1;
+
   return cuda_cub::find_if(policy,
                         first,
                         last,
-                        thrust::detail::equal_to_value<T>(value));
+                        _1 == value);
 }
 
 
 } // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/for_each.h b/thrust/system/cuda/detail/for_each.h
index 7a73242ba..518538ff3 100644
--- a/thrust/system/cuda/detail/for_each.h
+++ b/thrust/system/cuda/detail/for_each.h
@@ -26,6 +26,7 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <iterator>
@@ -36,7 +37,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/distance.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace cuda_cub {
 
@@ -54,7 +55,7 @@ namespace cuda_cub {
     template <class Size>
     THRUST_DEVICE_FUNCTION void operator()(Size idx)
     {
-      op(raw_reference_cast(input[idx]));
+      op(raw_reference_cast(*(input + idx)));
     }
   };
 
@@ -80,11 +81,6 @@ namespace cuda_cub {
                            for_each_f<Input, wrapped_t>(first, wrapped_op),
                            count);
 
-    cuda_cub::throw_on_error(
-      cuda_cub::synchronize(policy)
-    , "for_each: failed to synchronize"
-    );
-
     return first + count;
   }
 
@@ -104,5 +100,5 @@ namespace cuda_cub {
   }
 }    // namespace cuda_cub
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/future.inl b/thrust/system/cuda/detail/future.inl
index cfcda2cd5..f23184aae 100644
--- a/thrust/system/cuda/detail/future.inl
+++ b/thrust/system/cuda/detail/future.inl
@@ -9,10 +9,9 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/cpp11_required.h>
-#include <thrust/detail/modern_gcc_required.h>
+#include <thrust/detail/cpp14_required.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <thrust/optional.h>
 #include <thrust/detail/type_deduction.h>
@@ -30,9 +29,9 @@
 #include <thrust/system/cuda/detail/get_value.h>
 
 #include <type_traits>
-#include <memory>
+#include <thrust/detail/memory_wrapper.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 // Forward declaration.
 struct new_stream_t;
@@ -151,7 +150,7 @@ struct stream_deleter final
 struct stream_conditional_deleter final
 {
 private:
-  bool const cond_;
+  bool cond_;
 
 public:
   __host__
@@ -204,8 +203,13 @@ public:
 
   __thrust_exec_check_disable__
   unique_stream(unique_stream const&) = delete;
+
+  // GCC 10 complains if this is defaulted. See NVIDIA/thrust#1269.
   __thrust_exec_check_disable__
-  unique_stream(unique_stream&&) = default;
+  __host__ unique_stream(unique_stream &&o) noexcept
+    : handle_(std::move(o.handle_))
+  {}
+
   __thrust_exec_check_disable__
   unique_stream& operator=(unique_stream const&) = delete;
   __thrust_exec_check_disable__
@@ -580,8 +584,8 @@ private:
   int device_ = 0;
   pointer content_;
 
-  explicit weak_promise(int device, pointer content)
-    : device_(device), content_(std::move(content))
+  explicit weak_promise(int device_id, pointer content)
+    : device_(device_id), content_(std::move(content))
   {}
 
 public:
@@ -692,9 +696,9 @@ protected:
 
   __host__
   explicit unique_eager_event(
-    int device, std::unique_ptr<detail::async_signal> async_signal
+    int device_id, std::unique_ptr<detail::async_signal> async_signal
   )
-    : device_(device), async_signal_(std::move(async_signal))
+    : device_(device_id), async_signal_(std::move(async_signal))
   {}
 
 public:
@@ -779,7 +783,7 @@ public:
   friend __host__
   optional<detail::unique_stream>
   thrust::system::cuda::detail::try_acquire_stream(
-    int device, unique_eager_event& parent
+    int device_id, unique_eager_event& parent
     ) noexcept;
 
   template <typename... Dependencies>
@@ -807,9 +811,9 @@ private:
 
   __host__
   explicit unique_eager_future(
-    int device, std::unique_ptr<detail::async_value<value_type>> async_signal
+    int device_id, std::unique_ptr<detail::async_value<value_type>> async_signal
   )
-    : device_(device), async_signal_(std::move(async_signal))
+    : device_(device_id), async_signal_(std::move(async_signal))
   {}
 
 public:
@@ -917,7 +921,7 @@ public:
 
     value_type tmp(async_signal_->extract());
     async_signal_.reset();
-    return std::move(tmp);
+    return tmp;
   }
 
   // For testing only.
@@ -937,7 +941,7 @@ public:
   friend __host__
   optional<detail::unique_stream>
   thrust::system::cuda::detail::try_acquire_stream(
-    int device, unique_eager_future<X>& parent
+    int device_id, unique_eager_future<X>& parent
     ) noexcept;
 
   template <
@@ -992,12 +996,12 @@ try_acquire_stream(int, ready_future<X>&) noexcept
 
 __host__
 optional<unique_stream>
-try_acquire_stream(int device, unique_eager_event& parent) noexcept
+try_acquire_stream(int device_id, unique_eager_event& parent) noexcept
 {
   // We have unique ownership, so we can always steal the stream if the future
   // has one as long as they are on the same device as us.
   if (parent.valid_stream())
-    if (device == parent.device_)
+    if (device_id == parent.device_)
       return std::move(parent.async_signal_->stream());
 
   return {};
@@ -1006,12 +1010,12 @@ try_acquire_stream(int device, unique_eager_event& parent) noexcept
 template <typename X>
 __host__
 optional<unique_stream>
-try_acquire_stream(int device, unique_eager_future<X>& parent) noexcept
+try_acquire_stream(int device_id, unique_eager_future<X>& parent) noexcept
 {
   // We have unique ownership, so we can always steal the stream if the future
   // has one as long as they are on the same device as us.
   if (parent.valid_stream())
-    if (device == parent.device_)
+    if (device_id == parent.device_)
       return std::move(parent.async_signal_->stream());
 
   return {};
@@ -1033,27 +1037,27 @@ acquired_stream acquire_stream_impl(
 template <typename... Dependencies, std::size_t I0, std::size_t... Is>
 __host__
 acquired_stream acquire_stream_impl(
-  int device
+  int device_id
 , std::tuple<Dependencies...>& deps, index_sequence<I0, Is...>
 ) noexcept
 {
-  auto tr = try_acquire_stream(device, std::get<I0>(deps));
+  auto tr = try_acquire_stream(device_id, std::get<I0>(deps));
 
   if (tr)
     return {std::move(*tr), {I0}};
   else
-    return acquire_stream_impl(device, deps, index_sequence<Is...>{});
+    return acquire_stream_impl(device_id, deps, index_sequence<Is...>{});
 }
 
 template <typename... Dependencies>
 __host__
 acquired_stream acquire_stream(
-  int device
+  int device_id
 , std::tuple<Dependencies...>& deps
 ) noexcept
 {
   return acquire_stream_impl(
-    device, deps, make_index_sequence<sizeof...(Dependencies)>{}
+    device_id, deps, make_index_sequence<sizeof...(Dependencies)>{}
   );
 }
 
@@ -1266,11 +1270,11 @@ template <typename... Dependencies>
 __host__
 unique_eager_event make_dependent_event(std::tuple<Dependencies...>&& deps)
 {
-  int device = 0;
-  thrust::cuda_cub::throw_on_error(cudaGetDevice(&device));
+  int device_id = 0;
+  thrust::cuda_cub::throw_on_error(cudaGetDevice(&device_id));
 
   // First, either steal a stream from one of our children or make a new one.
-  auto as = acquire_stream(device, deps);
+  auto as = acquire_stream(device_id, deps);
 
   // Then, make the stream we've acquired asynchronously wait on all of our
   // dependencies, except the one we stole the stream from.
@@ -1290,7 +1294,7 @@ unique_eager_event make_dependent_event(std::tuple<Dependencies...>&& deps)
   );
 
   // Finally, we create the event object.
-  return unique_eager_event(device, std::move(sig));
+  return unique_eager_event(device_id, std::move(sig));
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1303,11 +1307,11 @@ __host__
 unique_eager_future_promise_pair<X, XPointer>
 make_dependent_future(ComputeContent&& cc, std::tuple<Dependencies...>&& deps)
 {
-  int device = 0;
-  thrust::cuda_cub::throw_on_error(cudaGetDevice(&device));
+  int device_id = 0;
+  thrust::cuda_cub::throw_on_error(cudaGetDevice(&device_id));
 
   // First, either steal a stream from one of our children or make a new one.
-  auto as = acquire_stream(device, deps);
+  auto as = acquire_stream(device_id, deps);
 
   // Then, make the stream we've acquired asynchronously wait on all of our
   // dependencies, except the one we stole the stream from.
@@ -1329,8 +1333,8 @@ make_dependent_future(ComputeContent&& cc, std::tuple<Dependencies...>&& deps)
   );
  
   // Finally, we create the promise and future objects.
-  weak_promise<X, XPointer> child_prom(device, sig->data());
-  unique_eager_future<X> child_fut(device, std::move(sig));
+  weak_promise<X, XPointer> child_prom(device_id, sig->data());
+  unique_eager_future<X> child_fut(device_id, std::move(sig));
 
   return unique_eager_future_promise_pair<X, XPointer>
     {std::move(child_fut), std::move(child_prom)};
@@ -1362,7 +1366,7 @@ THRUST_DECLTYPE_RETURNS(std::move(dependency))
 
 }} // namespace system::cuda
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
-#endif 
+#endif // C++14
 
diff --git a/thrust/system/cuda/detail/gather.h b/thrust/system/cuda/detail/gather.h
index e153a857a..56ff3aecf 100644
--- a/thrust/system/cuda/detail/gather.h
+++ b/thrust/system/cuda/detail/gather.h
@@ -26,12 +26,13 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <thrust/system/cuda/detail/transform.h>
 #include <thrust/iterator/permutation_iterator.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 template <class Derived,
@@ -101,6 +102,6 @@ gather_if(execution_policy<Derived>& policy,
 
 
 } // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #endif
diff --git a/thrust/system/cuda/detail/generate.h b/thrust/system/cuda/detail/generate.h
index e1058c873..ad6340f83 100644
--- a/thrust/system/cuda/detail/generate.h
+++ b/thrust/system/cuda/detail/generate.h
@@ -26,6 +26,7 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <iterator>
@@ -34,7 +35,7 @@
 #include <thrust/system/cuda/detail/for_each.h>
 #include <thrust/distance.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 // for_each functor
@@ -85,5 +86,5 @@ generate(execution_policy<Derived> &policy,
 }
 
 }    // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/get_value.h b/thrust/system/cuda/detail/get_value.h
index 68b987dde..9065f773a 100644
--- a/thrust/system/cuda/detail/get_value.h
+++ b/thrust/system/cuda/detail/get_value.h
@@ -16,14 +16,17 @@
 
 #pragma once
 
-#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <thrust/detail/config.h>
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <thrust/system/cuda/config.h>
 #include <thrust/system/cuda/detail/cross_system.h>
 #include <thrust/detail/raw_pointer_cast.h>
 #include <thrust/iterator/iterator_traits.h>
 
-THRUST_BEGIN_NS
+#include <nv/target>
+
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 
@@ -61,14 +64,10 @@ inline __host__ __device__
     }
   };
 
-#ifndef __CUDA_ARCH__
-  return war_nvbugs_881631::host_path(exec, ptr);
-#else
-  return war_nvbugs_881631::device_path(exec, ptr);
-#endif // __CUDA_ARCH__
+  NV_IF_TARGET(NV_IS_HOST,
+               (return war_nvbugs_881631::host_path(exec, ptr);),
+               (return war_nvbugs_881631::device_path(exec, ptr);))
 } // end get_value_msvc2005_war()
-
-
 } // end anon namespace
 
 
@@ -82,6 +81,6 @@ inline __host__ __device__
 
 
 } // end cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #endif
diff --git a/thrust/system/cuda/detail/inner_product.h b/thrust/system/cuda/detail/inner_product.h
index 4e1cd5a4c..98e9064d2 100644
--- a/thrust/system/cuda/detail/inner_product.h
+++ b/thrust/system/cuda/detail/inner_product.h
@@ -26,6 +26,7 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <iterator>
@@ -33,7 +34,7 @@
 #include <thrust/detail/minmax.h>
 #include <thrust/distance.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace cuda_cub {
 
@@ -89,5 +90,5 @@ inner_product(execution_policy<Derived> &policy,
 
 }    // namespace cuda_cub
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/internal/copy_cross_system.h b/thrust/system/cuda/detail/internal/copy_cross_system.h
index fcdd51f51..a1208c67c 100644
--- a/thrust/system/cuda/detail/internal/copy_cross_system.h
+++ b/thrust/system/cuda/detail/internal/copy_cross_system.h
@@ -30,6 +30,8 @@
 // this file must not be included on its own, ever,
 // but must be part of include in thrust/system/cuda/detail/copy.h
 
+#include <thrust/detail/config.h>
+
 #include <thrust/system/cuda/config.h>
 
 #include <thrust/distance.h>
@@ -40,7 +42,7 @@
 #include <thrust/detail/temporary_array.h>
 #include <thrust/type_traits/is_trivially_relocatable.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 namespace __copy {
@@ -99,12 +101,13 @@ namespace __copy {
 
   {
     typedef typename iterator_traits<InputIt>::value_type InputTy;
-
-    trivial_device_copy(derived_cast(sys1),
-                        derived_cast(sys2),
-                        reinterpret_cast<InputTy*>(thrust::raw_pointer_cast(&*result)),
-                        reinterpret_cast<InputTy const*>(thrust::raw_pointer_cast(&*begin)),
-                        n);
+    if (n > 0) {
+      trivial_device_copy(derived_cast(sys1),
+                          derived_cast(sys2),
+                          reinterpret_cast<InputTy*>(thrust::raw_pointer_cast(&*result)),
+                          reinterpret_cast<InputTy const*>(thrust::raw_pointer_cast(&*begin)),
+                          n);
+    }
 
     return result + n;
   }
@@ -238,4 +241,4 @@ namespace __copy {
 }    // namespace __copy
 
 } // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
diff --git a/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/system/cuda/detail/internal/copy_device_to_device.h
index eb4769904..69c4e20df 100644
--- a/thrust/system/cuda/detail/internal/copy_device_to_device.h
+++ b/thrust/system/cuda/detail/internal/copy_device_to_device.h
@@ -27,6 +27,7 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <thrust/system/cuda/config.h>
@@ -34,7 +35,7 @@
 #include <thrust/system/cuda/detail/transform.h>
 #include <thrust/functional.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 namespace __copy {
@@ -42,7 +43,7 @@ namespace __copy {
   template <class Derived,
             class InputIt,
             class OutputIt>
-  OutputIt CUB_RUNTIME_FUNCTION
+  OutputIt THRUST_RUNTIME_FUNCTION
   device_to_device(execution_policy<Derived>& policy,
                    InputIt                    first,
                    InputIt                    last,
@@ -59,5 +60,5 @@ namespace __copy {
 }    // namespace __copy
 
 }    // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/iter_swap.h b/thrust/system/cuda/detail/iter_swap.h
index ec545b056..c0628610a 100644
--- a/thrust/system/cuda/detail/iter_swap.h
+++ b/thrust/system/cuda/detail/iter_swap.h
@@ -16,15 +16,19 @@
 
 #pragma once
 
-#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <thrust/detail/config.h>
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
 #include <thrust/system/cuda/config.h>
 
 #include <thrust/detail/raw_pointer_cast.h>
 #include <thrust/system/cuda/detail/execution_policy.h>
 #include <thrust/swap.h>
 
-THRUST_BEGIN_NS
+#include <nv/target>
+
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 
@@ -48,14 +52,15 @@ void iter_swap(thrust::cuda::execution_policy<DerivedPolicy> &, Pointer1 a, Poin
     }
   };
 
-#ifndef __CUDA_ARCH__
-  return war_nvbugs_881631::host_path(a, b);
-#else
-  return war_nvbugs_881631::device_path(a, b);
-#endif // __CUDA_ARCH__
+  NV_IF_TARGET(NV_IS_HOST, (
+    war_nvbugs_881631::host_path(a, b);
+  ), (
+    war_nvbugs_881631::device_path(a, b);
+  ));
+
 } // end iter_swap()
 
 
 } // end cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/make_unsigned_special.h b/thrust/system/cuda/detail/make_unsigned_special.h
new file mode 100644
index 000000000..dda735767
--- /dev/null
+++ b/thrust/system/cuda/detail/make_unsigned_special.h
@@ -0,0 +1,43 @@
+/*
+ *  Copyright 2019 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+namespace detail {
+
+    template<typename Size>
+    struct make_unsigned_special;
+
+    template<>
+    struct make_unsigned_special<int> { typedef unsigned int type; };
+
+    // this is special, because CUDA's atomicAdd doesn't have an overload
+    // for unsigned long, for some godforsaken reason
+    template<>
+    struct make_unsigned_special<long> { typedef unsigned long long type; };
+
+    template<>
+    struct make_unsigned_special<long long> { typedef unsigned long long type; };
+
+}
+}
+THRUST_NAMESPACE_END
+
diff --git a/thrust/system/cuda/detail/malloc_and_free.h b/thrust/system/cuda/detail/malloc_and_free.h
index f4bff3659..1b12e2cc3 100644
--- a/thrust/system/cuda/detail/malloc_and_free.h
+++ b/thrust/system/cuda/detail/malloc_and_free.h
@@ -19,17 +19,21 @@
 #include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
 
 #include <thrust/detail/config.h>
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/detail/raw_reference_cast.h>
 #include <thrust/detail/seq.h>
-#include <thrust/memory.h>
 #include <thrust/system/cuda/config.h>
-#ifdef THRUST_CACHING_DEVICE_MALLOC
-#include <thrust/system/cuda/detail/cub/util_allocator.cuh>
-#endif
 #include <thrust/system/cuda/detail/util.h>
 #include <thrust/system/detail/bad_alloc.h>
+#include <thrust/detail/malloc_and_free.h>
+
+#ifdef THRUST_CACHING_DEVICE_MALLOC
+#include <cub/util_allocator.cuh>
+#endif
 
+#include <nv/target>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 #ifdef THRUST_CACHING_DEVICE_MALLOC
@@ -52,21 +56,34 @@ void *malloc(execution_policy<DerivedPolicy> &, std::size_t n)
 {
   void *result = 0;
 
-#ifndef __CUDA_ARCH__
+  // need to repeat a lot of code here because we can't use #if inside of the
+  // NV_IF_TARGET macro.
+  // The device path is the same either way, but the host allocations differ.
 #ifdef __CUB_CACHING_MALLOC
-  cub::CachingDeviceAllocator &alloc = get_allocator();
-  cudaError_t status = alloc.DeviceAllocate(&result, n);
-#else
-  cudaError_t status = cudaMalloc(&result, n);
-#endif
+  NV_IF_TARGET(NV_IS_HOST, (
+    cub::CachingDeviceAllocator &alloc = get_allocator();
+    cudaError_t status = alloc.DeviceAllocate(&result, n);
+
+    if (status != cudaSuccess)
+    {
+      cudaGetLastError(); // Clear global CUDA error state.
+      throw thrust::system::detail::bad_alloc(thrust::cuda_category().message(status).c_str());
+    }
+  ), ( // NV_IS_DEVICE
+    result = thrust::raw_pointer_cast(thrust::malloc(thrust::seq, n));
+  ));
+#else // not __CUB_CACHING_MALLOC
+  NV_IF_TARGET(NV_IS_HOST, (
+    cudaError_t status = cudaMalloc(&result, n);
 
-  if(status != cudaSuccess)
-  {
-  //  cuda_cub::throw_on_error(status, "device malloc failed");
-    thrust::system::detail::bad_alloc(thrust::cuda_category().message(status).c_str());
-  } 
-#else
-  result = thrust::raw_pointer_cast(thrust::malloc(thrust::seq, n));
+    if (status != cudaSuccess)
+    {
+      cudaGetLastError(); // Clear global CUDA error state.
+      throw thrust::system::detail::bad_alloc(thrust::cuda_category().message(status).c_str());
+    }
+  ), ( // NV_IS_DEVICE
+    result = thrust::raw_pointer_cast(thrust::malloc(thrust::seq, n));
+  ));
 #endif
 
   return result;
@@ -77,18 +94,26 @@ template<typename DerivedPolicy, typename Pointer>
 __host__ __device__
 void free(execution_policy<DerivedPolicy> &, Pointer ptr)
 {
-#ifndef __CUDA_ARCH__
+  // need to repeat a lot of code here because we can't use #if inside of the
+  // NV_IF_TARGET macro.
+  // The device path is the same either way, but the host deallocations differ.
 #ifdef __CUB_CACHING_MALLOC
-  cub::CachingDeviceAllocator &alloc = get_allocator();
-  cudaError_t status = alloc.DeviceFree(thrust::raw_pointer_cast(ptr));
-#else
-  cudaError_t status = cudaFree(thrust::raw_pointer_cast(ptr));
-#endif
-  cuda_cub::throw_on_error(status, "device free failed");
-#else
-  thrust::free(thrust::seq, ptr);
+  NV_IF_TARGET(NV_IS_HOST, (
+    cub::CachingDeviceAllocator &alloc = get_allocator();
+    cudaError_t status = alloc.DeviceFree(thrust::raw_pointer_cast(ptr));
+    cuda_cub::throw_on_error(status, "device free failed");
+  ), ( // NV_IS_DEVICE
+    thrust::free(thrust::seq, ptr);
+  ));
+#else // not __CUB_CACHING_MALLOC
+  NV_IF_TARGET(NV_IS_HOST, (
+    cudaError_t status = cudaFree(thrust::raw_pointer_cast(ptr));
+    cuda_cub::throw_on_error(status, "device free failed");
+  ), ( // NV_IS_DEVICE
+    thrust::free(thrust::seq, ptr);
+  ));
 #endif
 } // end free()
 
 }    // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
diff --git a/thrust/system/cuda/detail/memory.inl b/thrust/system/cuda/detail/memory.inl
index 82a04b67d..f6fc98359 100644
--- a/thrust/system/cuda/detail/memory.inl
+++ b/thrust/system/cuda/detail/memory.inl
@@ -21,8 +21,7 @@
 #include <thrust/system/cuda/detail/malloc_and_free.h>
 #include <limits>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub
 {
 
@@ -49,5 +48,5 @@ void free(pointer<void> ptr)
 } // end free()
 
 } // end cuda_cub
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cuda/detail/merge.h b/thrust/system/cuda/detail/merge.h
index 04c93858c..478e3508d 100644
--- a/thrust/system/cuda/detail/merge.h
+++ b/thrust/system/cuda/detail/merge.h
@@ -26,24 +26,26 @@ j * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
 #include <thrust/detail/cstdint.h>
 #include <thrust/detail/temporary_array.h>
-#include <thrust/system/cuda/detail/util.h>
-
-#include <thrust/system/cuda/detail/execution_policy.h>
-#include <thrust/system/cuda/detail/util.h>
+#include <thrust/detail/mpl/math.h>
+#include <thrust/distance.h>
+#include <thrust/extrema.h>
+#include <thrust/merge.h>
+#include <thrust/pair.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
 #include <thrust/system/cuda/detail/core/agent_launcher.h>
 #include <thrust/system/cuda/detail/core/util.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/util.h>
 #include <thrust/system/cuda/detail/par_to_seq.h>
-#include <thrust/merge.h>
-#include <thrust/extrema.h>
-#include <thrust/pair.h>
-#include <thrust/detail/mpl/math.h>
-#include <thrust/distance.h>
 
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 namespace __merge {
@@ -85,7 +87,7 @@ namespace __merge {
   }
 
   template <class It, class T2, class CompareOp, int ITEMS_PER_THREAD>
-  THRUST_DEVICE_FUNCTION void 
+  THRUST_DEVICE_FUNCTION void
   serial_merge(It  keys_shared,
                int keys1_beg,
                int keys2_beg,
@@ -97,7 +99,7 @@ namespace __merge {
   {
     int keys1_end = keys1_beg + keys1_count;
     int keys2_end = keys2_beg + keys2_count;
-    
+
     typedef typename iterator_value<It>::type key_type;
 
     key_type key1 = keys_shared[keys1_beg];
@@ -129,15 +131,13 @@ namespace __merge {
             int                      _ITEMS_PER_THREAD = 1,
             cub::BlockLoadAlgorithm  _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
             cub::CacheLoadModifier   _LOAD_MODIFIER    = cub::LOAD_LDG,
-            cub::BlockStoreAlgorithm _STORE_ALGORITHM  = cub::BLOCK_STORE_DIRECT,
-            int                      _MIN_BLOCKS       = 1>
+            cub::BlockStoreAlgorithm _STORE_ALGORITHM  = cub::BLOCK_STORE_DIRECT>
   struct PtxPolicy
   {
     enum
     {
       BLOCK_THREADS      = _BLOCK_THREADS,
       ITEMS_PER_THREAD   = _ITEMS_PER_THREAD,
-      MIN_BLOCKS         = _MIN_BLOCKS,
       ITEMS_PER_TILE     = _BLOCK_THREADS * _ITEMS_PER_THREAD,
     };
 
@@ -170,7 +170,7 @@ namespace __merge {
       Size partition_idx = blockDim.x * blockIdx.x + threadIdx.x;
       if (partition_idx < num_partitions)
       {
-        Size partition_at = thrust::min(partition_idx * items_per_tile,
+        Size partition_at = (thrust::min)(partition_idx * items_per_tile,
                                         keys1_count + keys2_count);
         Size partition_diag = merge_path(keys1,
                                          keys2,
@@ -186,10 +186,10 @@ namespace __merge {
 
   template <class Arch, class TSize>
   struct Tuning;
-  
+
   namespace mpl = thrust::detail::mpl::math;
 
-  template<size_t NOMINAL_4B_ITEMS_PER_THREAD, size_t INPUT_SIZE>
+  template<int NOMINAL_4B_ITEMS_PER_THREAD, size_t INPUT_SIZE>
   struct items_per_thread
   {
     enum
@@ -201,13 +201,13 @@ namespace __merge {
               mpl::max<
                   int,
                   1,
-                  (NOMINAL_4B_ITEMS_PER_THREAD * 4 / INPUT_SIZE)>::value>::value,
-      value = mpl::is_odd<size_t, ITEMS_PER_THREAD>::value
+                  static_cast<int>(NOMINAL_4B_ITEMS_PER_THREAD * 4 / INPUT_SIZE)>::value>::value,
+      value = mpl::is_odd<int, ITEMS_PER_THREAD>::value
                   ? ITEMS_PER_THREAD
                   : ITEMS_PER_THREAD + 1
     };
   };
-  
+
   template<class TSize>
   struct Tuning<sm30,TSize>
   {
@@ -226,9 +226,9 @@ namespace __merge {
                       cub::BLOCK_STORE_WARP_TRANSPOSE>
         type;
   };    // Tuning sm300
-  
 
-  
+
+
   template<class TSize>
   struct Tuning<sm60,TSize> : Tuning<sm30,TSize>
   {
@@ -265,7 +265,7 @@ namespace __merge {
                       cub::BLOCK_STORE_WARP_TRANSPOSE>
         type;
   };    // Tuning sm52
-  
+
   template<class TSize>
   struct Tuning<sm35,TSize> : Tuning<sm30,TSize>
   {
@@ -286,7 +286,7 @@ namespace __merge {
         type;
   };    // Tuning sm350
 
- 
+
   template<size_t VALUE>
   struct integer_constant : thrust::detail::integral_constant<size_t, VALUE> {};
 
@@ -447,7 +447,7 @@ namespace __merge {
       }
 
       //---------------------------------------------------------------------
-      // Tile processing 
+      // Tile processing
       //---------------------------------------------------------------------
 
       template <bool IS_FULL_TILE>
@@ -463,7 +463,7 @@ namespace __merge {
         Size partition_end = merge_partitions[tile_idx + 1];
 
         Size diag0 = ITEMS_PER_TILE * tile_idx;
-        Size diag1 = thrust::min(keys1_count + keys2_count, diag0 + ITEMS_PER_TILE);
+        Size diag1 = (thrust::min)(keys1_count + keys2_count, diag0 + ITEMS_PER_TILE);
 
         // compute bounding box for keys1 & keys2
         //
@@ -576,9 +576,9 @@ namespace __merge {
           }
         }
       }
-      
+
       //---------------------------------------------------------------------
-      // Constructor 
+      // Constructor
       //---------------------------------------------------------------------
 
       THRUST_DEVICE_FUNCTION
@@ -661,7 +661,7 @@ namespace __merge {
   };    // struct MergeAgent;
 
   //---------------------------------------------------------------------
-  // Two-step internal API 
+  // Two-step internal API
   //---------------------------------------------------------------------
 
   template <class MERGE_ITEMS,
@@ -673,7 +673,7 @@ namespace __merge {
             class KeysOutputIt,
             class ItemsOutputIt,
             class CompareOp>
-  cudaError_t CUB_RUNTIME_FUNCTION
+  cudaError_t THRUST_RUNTIME_FUNCTION
   doit_step(void*         d_temp_storage,
             size_t&       temp_storage_bytes,
             KeysIt1       keys1,
@@ -685,8 +685,7 @@ namespace __merge {
             KeysOutputIt  keys_result,
             ItemsOutputIt items_result,
             CompareOp     compare_op,
-            cudaStream_t  stream,
-            bool          debug_sync)
+            cudaStream_t  stream)
   {
     if (num_keys1 + num_keys2 == 0)
       return cudaErrorNotSupported;
@@ -745,7 +744,7 @@ namespace __merge {
     {
       Size num_partitions = num_tiles + 1;
 
-      partition_agent(partition_plan, num_partitions, stream, "partition agent", debug_sync)
+      partition_agent(partition_plan, num_partitions, stream, "partition agent")
           .launch(keys1,
                   keys2,
                   num_keys1,
@@ -757,7 +756,7 @@ namespace __merge {
       CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
     }
 
-    merge_agent(merge_plan, num_keys1 + num_keys2, stream, vshmem_ptr, "merge agent", debug_sync)
+    merge_agent(merge_plan, num_keys1 + num_keys2, stream, vshmem_ptr, "merge agent")
         .launch(keys1,
                 keys2,
                 items1,
@@ -809,8 +808,7 @@ namespace __merge {
 
     size_t       storage_size = 0;
     cudaStream_t stream       = cuda_cub::stream(policy);
-    bool         debug_sync   = THRUST_DEBUG_SYNC_FLAG;
-    
+
     cudaError_t status;
     status = doit_step<MERGE_ITEMS>(NULL,
                                     storage_size,
@@ -823,8 +821,7 @@ namespace __merge {
                                     keys_result,
                                     items_result,
                                     compare_op,
-                                    stream,
-                                    debug_sync);
+                                    stream);
     cuda_cub::throw_on_error(status, "merge: failed on 1st step");
 
     // Allocate temporary storage.
@@ -843,11 +840,10 @@ namespace __merge {
                                     keys_result,
                                     items_result,
                                     compare_op,
-                                    stream,
-                                    debug_sync);
+                                    stream);
     cuda_cub::throw_on_error(status, "merge: failed on 2nd step");
-    
-    status = cuda_cub::synchronize(policy);
+
+    status = cuda_cub::synchronize_optional(policy);
     cuda_cub::throw_on_error(status, "merge: failed to synchronize");
 
     return thrust::make_pair(keys_result + count, items_result + count);
@@ -876,38 +872,28 @@ merge(execution_policy<Derived>& policy,
       CompareOp                  compare_op)
 
 {
-  ResultIt ret = result;
-  if (__THRUST_HAS_CUDART__)
-  {
-    typedef typename thrust::iterator_value<KeysIt1>::type keys_type;
-    //
-    keys_type* null_ = NULL;
-    //
-    ret = __merge::merge<thrust::detail::false_type>(policy,
-                                                     keys1_first,
-                                                     keys1_last,
-                                                     keys2_first,
-                                                     keys2_last,
-                                                     null_,
-                                                     null_,
-                                                     result,
-                                                     null_,
-                                                     compare_op)
-              .first;
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::merge(cvt_to_seq(derived_cast(policy)),
-                        keys1_first,
-                        keys1_last,
-                        keys2_first,
-                        keys2_last,
-                        result,
-                        compare_op);
-#endif
-  }
-  return ret;
+  THRUST_CDP_DISPATCH((using keys_type  = thrust::iterator_value_t<KeysIt1>;
+                       keys_type *null_ = nullptr;
+                       auto tmp =
+                         __merge::merge<thrust::detail::false_type>(policy,
+                                                                    keys1_first,
+                                                                    keys1_last,
+                                                                    keys2_first,
+                                                                    keys2_last,
+                                                                    null_,
+                                                                    null_,
+                                                                    result,
+                                                                    null_,
+                                                                    compare_op);
+                       result = tmp.first;),
+                      (result = thrust::merge(cvt_to_seq(derived_cast(policy)),
+                                              keys1_first,
+                                              keys1_last,
+                                              keys2_first,
+                                              keys2_last,
+                                              result,
+                                              compare_op);));
+  return result;
 }
 
 template <class Derived, class KeysIt1, class KeysIt2, class ResultIt>
@@ -950,10 +936,9 @@ merge_by_key(execution_policy<Derived> &policy,
              ItemsOutputIt              items_result,
              CompareOp                  compare_op)
 {
-  pair<KeysOutputIt, ItemsOutputIt> ret = thrust::make_pair(keys_result, items_result);
-  if (__THRUST_HAS_CUDART__)
-  {
-    return __merge::merge<thrust::detail::true_type>(policy,
+  auto ret = thrust::make_pair(keys_result, items_result);
+  THRUST_CDP_DISPATCH(
+    (ret = __merge::merge<thrust::detail::true_type>(policy,
                                                      keys1_first,
                                                      keys1_last,
                                                      keys2_first,
@@ -962,23 +947,17 @@ merge_by_key(execution_policy<Derived> &policy,
                                                      items2_first,
                                                      keys_result,
                                                      items_result,
-                                                     compare_op);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::merge_by_key(cvt_to_seq(derived_cast(policy)),
-                               keys1_first,
-                               keys1_last,
-                               keys2_first,
-                               keys2_last,
-                               items1_first,
-                               items2_first,
-                               keys_result,
-                               items_result,
-                               compare_op);
-#endif
-  }
+                                                     compare_op);),
+    (ret = thrust::merge_by_key(cvt_to_seq(derived_cast(policy)),
+                                keys1_first,
+                                keys1_last,
+                                keys2_first,
+                                keys2_last,
+                                items1_first,
+                                items2_first,
+                                keys_result,
+                                items_result,
+                                compare_op);));
   return ret;
 }
 
@@ -1015,5 +994,5 @@ merge_by_key(execution_policy<Derived> &policy,
 
 
 }    // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/mismatch.h b/thrust/system/cuda/detail/mismatch.h
index 845c93723..b1e2f44d2 100644
--- a/thrust/system/cuda/detail/mismatch.h
+++ b/thrust/system/cuda/detail/mismatch.h
@@ -26,6 +26,7 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <thrust/system/cuda/config.h>
@@ -33,7 +34,7 @@
 #include <thrust/pair.h>
 #include <thrust/distance.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 template <class Derived,
@@ -56,11 +57,11 @@ mismatch(execution_policy<Derived>& policy,
          InputIt1                   last1,
          InputIt2                   first2);
 } // namespace cuda_
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #include <thrust/system/cuda/detail/find.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 template <class Derived,
@@ -87,8 +88,8 @@ mismatch(execution_policy<Derived>& policy,
                                           transform_first + thrust::distance(first1, last1),
                                           identity());
 
-  return make_pair(first1 + thrust::distance(transform_first,result),
-                   first2 + thrust::distance(transform_first,result));
+  return thrust::make_pair(first1 + thrust::distance(transform_first,result),
+                           first2 + thrust::distance(transform_first,result));
 }
 
 template <class Derived,
@@ -111,5 +112,5 @@ mismatch(execution_policy<Derived>& policy,
 
 
 } // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/par.h b/thrust/system/cuda/detail/par.h
index 0a4e3ac5c..42c701ca7 100644
--- a/thrust/system/cuda/detail/par.h
+++ b/thrust/system/cuda/detail/par.h
@@ -29,6 +29,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
 #include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/util.h>
 
 #include <thrust/detail/allocator_aware_execution_policy.h>
 
@@ -37,49 +38,49 @@
 #endif
 
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
-inline __host__ __device__
-cudaStream_t
-default_stream()
-{
-  return cudaStreamLegacy;
-}
-
 template <class Derived>
-__host__ __device__
-cudaStream_t
-get_stream(execution_policy<Derived> &)
+struct execute_on_stream_base : execution_policy<Derived>
 {
-  return default_stream();
-}
+private:
+  cudaStream_t stream;
 
-__thrust_exec_check_disable__
-template <class Derived>
-__host__ __device__
-cudaError_t
-synchronize_stream(execution_policy<Derived> &)
-{
-  #if __THRUST_HAS_CUDART__
-    cudaDeviceSynchronize();
-    return cudaGetLastError();
-  #else
-    return cudaSuccess;
-  #endif
-}
+public:
+  __thrust_exec_check_disable__
+  __host__ __device__
+  execute_on_stream_base(cudaStream_t stream_ = default_stream())
+      : stream(stream_){}
+
+  THRUST_RUNTIME_FUNCTION
+  Derived
+  on(cudaStream_t const &s) const
+  {
+    Derived result = derived_cast(*this);
+    result.stream  = s;
+    return result;
+  }
 
+private:
+  friend __host__ __device__
+  cudaStream_t
+  get_stream(const execute_on_stream_base &exec)
+  {
+    return exec.stream;
+  }
+};
 
 template <class Derived>
-struct execute_on_stream_base : execution_policy<Derived>
+struct execute_on_stream_nosync_base : execution_policy<Derived>
 {
 private:
   cudaStream_t stream;
 
 public:
   __host__ __device__
-  execute_on_stream_base(cudaStream_t stream_ = default_stream())
-      : stream(stream_) {}
+  execute_on_stream_nosync_base(cudaStream_t stream_ = default_stream())
+      : stream(stream_){}
 
   THRUST_RUNTIME_FUNCTION
   Derived
@@ -93,26 +94,16 @@ struct execute_on_stream_base : execution_policy<Derived>
 private:
   friend __host__ __device__
   cudaStream_t
-  get_stream(const execute_on_stream_base &exec)
+  get_stream(const execute_on_stream_nosync_base &exec)
   {
     return exec.stream;
   }
 
   friend __host__ __device__
-  cudaError_t
-  synchronize_stream(execute_on_stream_base &exec)
+  bool
+  must_perform_optional_stream_synchronization(const execute_on_stream_nosync_base &)
   {
-    #if   !__CUDA_ARCH__
-      cudaStreamSynchronize(exec.stream);
-      return cudaGetLastError();
-    #elif __THRUST_HAS_CUDART__
-      THRUST_UNUSED_VAR(exec);
-      cudaDeviceSynchronize();
-      return cudaGetLastError();
-    #else
-      THRUST_UNUSED_VAR(exec);
-      return cudaSuccess;
-    #endif
+    return false;
   }
 };
 
@@ -123,7 +114,19 @@ struct execute_on_stream : execute_on_stream_base<execute_on_stream>
   __host__ __device__
   execute_on_stream() : base_t(){};
   __host__ __device__
-  execute_on_stream(cudaStream_t stream) : base_t(stream){};
+  execute_on_stream(cudaStream_t stream) 
+  : base_t(stream){};
+};
+
+struct execute_on_stream_nosync : execute_on_stream_nosync_base<execute_on_stream_nosync>
+{
+  typedef execute_on_stream_nosync_base<execute_on_stream_nosync> base_t;
+
+  __host__ __device__
+  execute_on_stream_nosync() : base_t(){};
+  __host__ __device__
+  execute_on_stream_nosync(cudaStream_t stream) 
+  : base_t(stream){};
 };
 
 
@@ -138,7 +141,7 @@ struct par_t : execution_policy<par_t>,
   typedef execution_policy<par_t> base_t;
 
   __host__ __device__
-  par_t() : base_t() {}
+  constexpr par_t() : base_t() {}
 
   typedef execute_on_stream stream_attachment_type;
 
@@ -150,25 +153,106 @@ struct par_t : execution_policy<par_t>,
   }
 };
 
-#ifdef __CUDA_ARCH__
-static const __device__ par_t par;
-#else
-static const par_t par;
+struct par_nosync_t : execution_policy<par_nosync_t>,
+  thrust::detail::allocator_aware_execution_policy<
+    execute_on_stream_nosync_base>
+#if THRUST_CPP_DIALECT >= 2011
+, thrust::detail::dependencies_aware_execution_policy<
+    execute_on_stream_nosync_base>
 #endif
+{
+  typedef execution_policy<par_nosync_t> base_t;
+
+  __host__ __device__
+  constexpr par_nosync_t() : base_t() {}
+
+  typedef execute_on_stream_nosync stream_attachment_type;
+
+  THRUST_RUNTIME_FUNCTION
+  stream_attachment_type
+  on(cudaStream_t const &stream) const
+  {
+    return execute_on_stream_nosync(stream);
+  }
+
+private:
+  //this function is defined to allow non-blocking calls on the default_stream() with thrust::cuda::par_nosync
+  //without explicitly using thrust::cuda::par_nosync.on(default_stream())
+  friend __host__ __device__
+  bool
+  must_perform_optional_stream_synchronization(const par_nosync_t &)
+  {
+    return false;
+  }
+};
+
+THRUST_INLINE_CONSTANT par_t par;
+
+/*! \p thrust::cuda::par_nosync is a parallel execution policy targeting Thrust's CUDA device backend.
+ *  Similar to \p thrust::cuda::par it allows execution of Thrust algorithms in a specific CUDA stream.
+ *
+ *  \p thrust::cuda::par_nosync indicates that an algorithm is free to avoid any synchronization of the 
+ *  associated stream that is not strictly required for correctness. Additionally, algorithms may return
+ *  before the corresponding kernels are completed, similar to asynchronous kernel launches via <<< >>> syntax.
+ *  The user must take care to perform explicit synchronization if necessary.
+ *  
+ *  The following code snippet demonstrates how to use \p thrust::cuda::par_nosync :
+ *
+ *  \code
+ *    #include <thrust/device_vector.h>
+ *    #include <thrust/for_each.h>
+ *    #include <thrust/execution_policy.h>
+ *
+ *    struct IncFunctor{
+ *        __host__ __device__
+ *        void operator()(std::size_t& x){ x = x + 1; };
+ *    };
+ *
+ *    int main(){
+ *        std::size_t N = 1000000;
+ *        thrust::device_vector<std::size_t> d_vec(N);
+ *
+ *        cudaStream_t stream;
+ *        cudaStreamCreate(&stream);
+ *        auto nosync_policy = thrust::cuda::par_nosync.on(stream);
+ *
+ *        thrust::for_each(nosync_policy, d_vec.begin(), d_vec.end(), IncFunctor{});
+ *        thrust::for_each(nosync_policy, d_vec.begin(), d_vec.end(), IncFunctor{});
+ *        thrust::for_each(nosync_policy, d_vec.begin(), d_vec.end(), IncFunctor{});
+ *
+ *        //for_each may return before completion. Could do other cpu work in the meantime
+ *        // ...
+ *
+ *        //Wait for the completion of all for_each kernels
+ *        cudaStreamSynchronize(stream);
+ *
+ *        std::size_t x = thrust::reduce(nosync_policy, d_vec.begin(), d_vec.end());
+ *        //Currently, this synchronization is not necessary. reduce will still perform
+ *        //implicit synchronization to transfer the reduced value to the host to return it.
+ *        cudaStreamSynchronize(stream);
+ *        cudaStreamDestroy(stream);
+ *    }
+ *  \endcode
+ *
+ */
+THRUST_INLINE_CONSTANT par_nosync_t par_nosync;
 }    // namespace cuda_
 
 namespace system {
 namespace cuda {
   using thrust::cuda_cub::par;
+  using thrust::cuda_cub::par_nosync;
   namespace detail {
     using thrust::cuda_cub::par_t;
+    using thrust::cuda_cub::par_nosync_t;
   }
 } // namesapce cuda
 } // namespace system
 
 namespace cuda {
 using thrust::cuda_cub::par;
+using thrust::cuda_cub::par_nosync;
 } // namespace cuda
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cuda/detail/par_to_seq.h b/thrust/system/cuda/detail/par_to_seq.h
index f1610b288..e710f017b 100644
--- a/thrust/system/cuda/detail/par_to_seq.h
+++ b/thrust/system/cuda/detail/par_to_seq.h
@@ -26,10 +26,12 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/seq.h>
 #include <thrust/system/cuda/detail/par.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 template <int PAR>
@@ -80,11 +82,5 @@ cvt_to_seq(Policy& policy)
   return cvt_to_seq_impl<Policy>::doit(policy);
 }
 
-#if __THRUST_HAS_CUDART__
-#define THRUST_CUDART_DISPATCH par
-#else
-#define THRUST_CUDART_DISPATCH seq
-#endif
-
 } // namespace cuda_
-THRUST_END_NS
+THRUST_NAMESPACE_END
diff --git a/thrust/system/cuda/detail/parallel_for.h b/thrust/system/cuda/detail/parallel_for.h
index 302c90620..43c3297aa 100644
--- a/thrust/system/cuda/detail/parallel_for.h
+++ b/thrust/system/cuda/detail/parallel_for.h
@@ -26,25 +26,25 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-#include <thrust/system/cuda/config.h>
 
-#include <thrust/system/cuda/detail/util.h>
 #include <thrust/detail/type_traits/result_of_adaptable_function.h>
-#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
 #include <thrust/system/cuda/detail/core/agent_launcher.h>
 #include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/util.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace cuda_cub {
 
 namespace __parallel_for {
 
   template <int _BLOCK_THREADS,
-            int _ITEMS_PER_THREAD = 1,
-            int _MIN_BLOCKS       = 1>
+            int _ITEMS_PER_THREAD = 1>
   struct PtxPolicy
   {
     enum
@@ -52,7 +52,6 @@ namespace __parallel_for {
       BLOCK_THREADS    = _BLOCK_THREADS,
       ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
       ITEMS_PER_TILE   = BLOCK_THREADS * ITEMS_PER_THREAD,
-      MIN_BLOCKS       = _MIN_BLOCKS
     };
   };    // struct PtxPolicy
 
@@ -133,12 +132,10 @@ namespace __parallel_for {
     using core::AgentLauncher;
     using core::AgentPlan;
 
-    bool debug_sync = THRUST_DEBUG_SYNC_FLAG;
-
     typedef AgentLauncher<ParallelForAgent<F, Size> > parallel_for_agent;
     AgentPlan parallel_for_plan = parallel_for_agent::get_plan(stream);
 
-    parallel_for_agent pfa(parallel_for_plan, num_items, stream, "transform::agent", debug_sync);
+    parallel_for_agent pfa(parallel_for_plan, num_items, stream, "transform::agent");
     pfa.launch(f, num_items);
     CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
 
@@ -146,7 +143,7 @@ namespace __parallel_for {
   }
 }    // __parallel_for
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <class Derived,
           class F,
           class Size>
@@ -156,24 +153,27 @@ parallel_for(execution_policy<Derived> &policy,
              Size                       count)
 {
   if (count == 0)
-    return;
-
-  if (__THRUST_HAS_CUDART__)
-  {
-    cudaStream_t stream = cuda_cub::stream(policy);
-    cudaError_t  status = __parallel_for::parallel_for(count, f, stream);
-    cuda_cub::throw_on_error(status, "parallel_for failed");
-  }
-  else
   {
-#if !__THRUST_HAS_CUDART__
-    for (Size idx = 0; idx != count; ++idx)
-      f(idx);
-#endif
+    return;
   }
+
+  // clang-format off
+  THRUST_CDP_DISPATCH(
+    (cudaStream_t stream = cuda_cub::stream(policy);
+     cudaError_t  status = __parallel_for::parallel_for(count, f, stream);
+     cuda_cub::throw_on_error(status, "parallel_for failed");
+     status = cuda_cub::synchronize_optional(policy);
+     cuda_cub::throw_on_error(status, "parallel_for: failed to synchronize");),
+    // CDP sequential impl:
+    (for (Size idx = 0; idx != count; ++idx)
+     {
+       f(idx);
+     }
+  ));
+  // clang-format on
 }
 
 }    // namespace cuda_cub
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/partition.h b/thrust/system/cuda/detail/partition.h
index 24f667e2f..fad75eb0d 100644
--- a/thrust/system/cuda/detail/partition.h
+++ b/thrust/system/cuda/detail/partition.h
@@ -26,31 +26,37 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-#include <thrust/system/cuda/config.h>
 
 #include <thrust/detail/cstdint.h>
 #include <thrust/detail/temporary_array.h>
-#include <thrust/system/cuda/detail/util.h>
-#include <thrust/system/cuda/detail/reverse.h>
+#include <thrust/distance.h>
+#include <thrust/pair.h>
+#include <thrust/partition.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
 #include <thrust/system/cuda/detail/find.h>
+#include <thrust/system/cuda/detail/reverse.h>
 #include <thrust/system/cuda/detail/uninitialized_copy.h>
-#include <thrust/system/cuda/detail/cub/device/device_partition.cuh>
-#include <thrust/system/cuda/detail/core/agent_launcher.h>
 #include <thrust/system/cuda/detail/par_to_seq.h>
-#include <thrust/partition.h>
-#include <thrust/pair.h>
-#include <thrust/distance.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <cub/agent/single_pass_scan_operators.cuh> // cub::ScanTileState
+#include <cub/block/block_scan.cuh>
+#include <cub/device/device_partition.cuh>
+#include <cub/util_device.cuh>
+#include <cub/util_math.cuh>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 namespace __partition {
 
   template <int                     _BLOCK_THREADS,
             int                     _ITEMS_PER_THREAD = 1,
-            int                     _MIN_BLOCKS       = 1,
             cub::BlockLoadAlgorithm _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
             cub::CacheLoadModifier  _LOAD_MODIFIER    = cub::LOAD_LDG,
             cub::BlockScanAlgorithm _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS>
@@ -60,8 +66,7 @@ namespace __partition {
     {
       BLOCK_THREADS      = _BLOCK_THREADS,
       ITEMS_PER_THREAD   = _ITEMS_PER_THREAD,
-      MIN_BLOCKS         = _MIN_BLOCKS,
-      ITEMS_PER_TILE     = _BLOCK_THREADS * _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE     = _BLOCK_THREADS * _ITEMS_PER_THREAD
     };
     static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
     static const cub::CacheLoadModifier  LOAD_MODIFIER  = _LOAD_MODIFIER;
@@ -84,13 +89,12 @@ namespace __partition {
 
     typedef PtxPolicy<128,
                       ITEMS_PER_THREAD,
-                      1,
                       cub::BLOCK_LOAD_WARP_TRANSPOSE,
                       cub::LOAD_LDG,
                       cub::BLOCK_SCAN_WARP_SCANS>
         type;
   };    // Tuning<350>
-  
+
   template<class T>
   struct Tuning<sm30, T>
   {
@@ -104,19 +108,18 @@ namespace __partition {
 
     typedef PtxPolicy<128,
                       ITEMS_PER_THREAD,
-                      1,
                       cub::BLOCK_LOAD_WARP_TRANSPOSE,
                       cub::LOAD_DEFAULT,
                       cub::BLOCK_SCAN_WARP_SCANS>
         type;
   };    // Tuning<300>
-  
+
   template<int T>
   struct __tag{};
 
 
   struct no_stencil_tag_    {};
-  struct single_output_tag_ 
+  struct single_output_tag_
   {
     template<class T>
     THRUST_DEVICE_FUNCTION T const& operator=(T const& t) const { return t; }
@@ -167,11 +170,11 @@ namespace __partition {
 
       union TempStorage
       {
-        struct
+        struct ScanStorage
         {
           typename BlockScan::TempStorage          scan;
           typename TilePrefixCallback::TempStorage prefix;
-        };
+        } scan_storage;
 
         typename BlockLoadItems::TempStorage   load_items;
         typename BlockLoadStencil::TempStorage load_stencil;
@@ -358,7 +361,7 @@ namespace __partition {
       }
 
       //---------------------------------------------------------------------
-      // Tile processing 
+      // Tile processing
       //---------------------------------------------------------------------
 
       template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
@@ -418,7 +421,7 @@ namespace __partition {
         Size num_rejected_prefix   = 0;
         if (IS_FIRST_TILE)
         {
-          BlockScan(temp_storage.scan)
+          BlockScan(temp_storage.scan_storage.scan)
               .ExclusiveSum(selection_flags,
                             selection_idx,
                             num_tile_selections);
@@ -441,10 +444,10 @@ namespace __partition {
         else
         {
           TilePrefixCallback prefix_cb(tile_state,
-                                       temp_storage.prefix,
+                                       temp_storage.scan_storage.prefix,
                                        cub::Sum(),
                                        tile_idx);
-          BlockScan(temp_storage.scan)
+          BlockScan(temp_storage.scan_storage.scan)
               .ExclusiveSum(selection_flags,
                             selection_idx,
                             prefix_cb);
@@ -582,7 +585,7 @@ namespace __partition {
   {
     template <class Arch>
     struct PtxPlan : PtxPolicy<128> {};
-   
+
 
     typedef core::specialize_plan<PtxPlan> ptx_plan;
 
@@ -619,8 +622,7 @@ namespace __partition {
             Predicate        predicate,
             NumSelectedOutIt num_selected_out,
             Size             num_items,
-            cudaStream_t     stream,
-            bool             debug_sync)
+            cudaStream_t     stream)
   {
     using core::AgentLauncher;
     using core::AgentPlan;
@@ -648,7 +650,7 @@ namespace __partition {
     typename get_plan<partition_agent>::type partition_plan = partition_agent::get_plan(stream);
 
     int tile_size = partition_plan.items_per_tile;
-    size_t num_tiles = (num_items + tile_size - 1) / tile_size;
+    size_t num_tiles = cub::DivideAndRoundUp(num_items, tile_size);
 
     size_t vshmem_storage = core::vshmem_size(partition_plan.shared_memory_size,
                                               num_tiles);
@@ -660,15 +662,15 @@ namespace __partition {
     size_t allocation_sizes[2] = {0, vshmem_storage};
     status = ScanTileState::AllocationSize(static_cast<int>(num_tiles), allocation_sizes[0]);
     CUDA_CUB_RET_IF_FAIL(status);
-    
+
 
     void* allocations[2] = {NULL, NULL};
     status = cub::AliasTemporaries(d_temp_storage,
-                                   temp_storage_bytes,
-                                   allocations,
-                                   allocation_sizes);
+                                                temp_storage_bytes,
+                                                allocations,
+                                                allocation_sizes);
     CUDA_CUB_RET_IF_FAIL(status);
-    
+
     if (d_temp_storage == NULL)
     {
       return status;
@@ -678,11 +680,11 @@ namespace __partition {
     status = tile_status.Init(static_cast<int>(num_tiles), allocations[0], allocation_sizes[0]);
     CUDA_CUB_RET_IF_FAIL(status);
 
-    init_agent ia(init_plan, num_tiles, stream, "partition::init_agent", debug_sync);
+    init_agent ia(init_plan, num_tiles, stream, "partition::init_agent");
 
     char *vshmem_ptr = vshmem_storage > 0 ? (char *)allocations[1] : NULL;
 
-    partition_agent pa(partition_plan, num_items, stream, vshmem_ptr, "partition::partition_agent", debug_sync);
+    partition_agent pa(partition_plan, num_items, stream, vshmem_ptr, "partition::partition_agent");
 
     ia.launch(tile_status, num_tiles, num_selected_out);
     CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
@@ -722,7 +724,6 @@ namespace __partition {
     size_type    num_items          = static_cast<size_type>(thrust::distance(first, last));
     size_t       temp_storage_bytes = 0;
     cudaStream_t stream             = cuda_cub::stream(policy);
-    bool         debug_sync         = THRUST_DEBUG_SYNC_FLAG;
 
     cudaError_t status;
     status = doit_step(NULL,
@@ -734,8 +735,7 @@ namespace __partition {
                        predicate,
                        reinterpret_cast<size_type*>(NULL),
                        num_items,
-                       stream,
-                       debug_sync);
+                       stream);
     cuda_cub::throw_on_error(status, "partition failed on 1st step");
 
     size_t allocation_sizes[2] = {sizeof(size_type), temp_storage_bytes};
@@ -772,8 +772,7 @@ namespace __partition {
                        predicate,
                        d_num_selected_out,
                        num_items,
-                       stream,
-                       debug_sync);
+                       stream);
     cuda_cub::throw_on_error(status, "partition failed on 2nd step");
 
     status = cuda_cub::synchronize(policy);
@@ -831,7 +830,7 @@ namespace __partition {
 // Thrust API entry points
 //-------------------------
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <class Derived,
           class InputIt,
           class StencilIt,
@@ -847,29 +846,22 @@ partition_copy(execution_policy<Derived> &policy,
                RejectedOutIt              rejected_result,
                Predicate                  predicate)
 {
-  pair<SelectedOutIt, RejectedOutIt> ret = thrust::make_pair(selected_result, rejected_result);
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __partition::partition(policy,
-                            first,
-                            last,
-                            stencil,
-                            selected_result,
-                            rejected_result,
-                            predicate);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::partition_copy(cvt_to_seq(derived_cast(policy)),
-                                 first,
-                                 last,
-                                 stencil,
-                                 selected_result,
-                                 rejected_result,
-                                 predicate);
-#endif
-  }
+  auto ret = thrust::make_pair(selected_result, rejected_result);
+  THRUST_CDP_DISPATCH(
+    (ret = __partition::partition(policy,
+                                  first,
+                                  last,
+                                  stencil,
+                                  selected_result,
+                                  rejected_result,
+                                  predicate);),
+    (ret = thrust::partition_copy(cvt_to_seq(derived_cast(policy)),
+                                  first,
+                                  last,
+                                  stencil,
+                                  selected_result,
+                                  rejected_result,
+                                  predicate);));
   return ret;
 }
 
@@ -887,28 +879,21 @@ partition_copy(execution_policy<Derived> &policy,
                RejectedOutIt              rejected_result,
                Predicate                  predicate)
 {
-  pair<SelectedOutIt, RejectedOutIt> ret = thrust::make_pair(selected_result, rejected_result);
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __partition::partition(policy,
-                                 first,
-                                 last,
-                                 __partition::no_stencil_tag(),
-                                 selected_result,
-                                 rejected_result,
-                                 predicate);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::partition_copy(cvt_to_seq(derived_cast(policy)),
-                                 first,
-                                 last,
-                                 selected_result,
-                                 rejected_result,
-                                 predicate);
-#endif
-  }
+  auto ret = thrust::make_pair(selected_result, rejected_result);
+  THRUST_CDP_DISPATCH(
+    (ret = __partition::partition(policy,
+                                  first,
+                                  last,
+                                  __partition::no_stencil_tag(),
+                                  selected_result,
+                                  rejected_result,
+                                  predicate);),
+    (ret = thrust::partition_copy(cvt_to_seq(derived_cast(policy)),
+                                  first,
+                                  last,
+                                  selected_result,
+                                  rejected_result,
+                                  predicate);));
   return ret;
 }
 
@@ -926,28 +911,21 @@ stable_partition_copy(execution_policy<Derived> &policy,
                       RejectedOutIt              rejected_result,
                       Predicate                  predicate)
 {
-  pair<SelectedOutIt, RejectedOutIt> ret = thrust::make_pair(selected_result, rejected_result);
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __partition::partition(policy,
-                                 first,
-                                 last,
-                                 __partition::no_stencil_tag(),
-                                 selected_result,
-                                 rejected_result,
-                                 predicate);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::stable_partition_copy(cvt_to_seq(derived_cast(policy)),
-                                        first,
-                                        last,
-                                        selected_result,
-                                        rejected_result,
-                                        predicate);
-#endif
-  }
+  auto ret = thrust::make_pair(selected_result, rejected_result);
+  THRUST_CDP_DISPATCH(
+    (ret = __partition::partition(policy,
+                                  first,
+                                  last,
+                                  __partition::no_stencil_tag(),
+                                  selected_result,
+                                  rejected_result,
+                                  predicate);),
+    (ret = thrust::stable_partition_copy(cvt_to_seq(derived_cast(policy)),
+                                         first,
+                                         last,
+                                         selected_result,
+                                         rejected_result,
+                                         predicate);));
   return ret;
 }
 
@@ -967,29 +945,22 @@ stable_partition_copy(execution_policy<Derived> &policy,
                       RejectedOutIt              rejected_result,
                       Predicate                  predicate)
 {
-  pair<SelectedOutIt, RejectedOutIt> ret = thrust::make_pair(selected_result, rejected_result);
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __partition::partition(policy,
-                                 first,
-                                 last,
-                                 stencil,
-                                 selected_result,
-                                 rejected_result,
-                                 predicate);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::stable_partition_copy(cvt_to_seq(derived_cast(policy)),
-                                        first,
-                                        last,
-                                        stencil,
-                                        selected_result,
-                                        rejected_result,
-                                        predicate);
-#endif
-  }
+  auto ret = thrust::make_pair(selected_result, rejected_result);
+  THRUST_CDP_DISPATCH(
+    (ret = __partition::partition(policy,
+                                  first,
+                                  last,
+                                  stencil,
+                                  selected_result,
+                                  rejected_result,
+                                  predicate);),
+    (ret = thrust::stable_partition_copy(cvt_to_seq(derived_cast(policy)),
+                                         first,
+                                         last,
+                                         stencil,
+                                         selected_result,
+                                         rejected_result,
+                                         predicate);));
   return ret;
 }
 
@@ -1007,22 +978,15 @@ partition(execution_policy<Derived> &policy,
           StencilIt                  stencil,
           Predicate                  predicate)
 {
-  Iterator ret = first;
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __partition::partition_inplace(policy, first, last, stencil, predicate);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::partition(cvt_to_seq(derived_cast(policy)),
-                            first,
-                            last,
-                            stencil,
-                            predicate);
-#endif
-  }
-  return ret;
+  THRUST_CDP_DISPATCH(
+    (last =
+       __partition::partition_inplace(policy, first, last, stencil, predicate);),
+    (last = thrust::partition(cvt_to_seq(derived_cast(policy)),
+                              first,
+                              last,
+                              stencil,
+                              predicate);));
+  return last;
 }
 
 __thrust_exec_check_disable__
@@ -1035,25 +999,17 @@ partition(execution_policy<Derived> &policy,
           Iterator                   last,
           Predicate                  predicate)
 {
-  Iterator ret = first;
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __partition::partition_inplace(policy,
-                                         first,
-                                         last,
-                                         __partition::no_stencil_tag(),
-                                         predicate);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::partition(cvt_to_seq(derived_cast(policy)),
-                            first,
-                            last,
-                            predicate);
-#endif
-  }
-  return ret;
+  THRUST_CDP_DISPATCH(
+    (last = __partition::partition_inplace(policy,
+                                           first,
+                                           last,
+                                           __partition::no_stencil_tag(),
+                                           predicate);),
+    (last = thrust::partition(cvt_to_seq(derived_cast(policy)),
+                              first,
+                              last,
+                              predicate);));
+  return last;
 }
 
 __thrust_exec_check_disable__
@@ -1068,30 +1024,20 @@ stable_partition(execution_policy<Derived> &policy,
                  StencilIt                  stencil,
                  Predicate                  predicate)
 {
-  Iterator result = first;
-  if (__THRUST_HAS_CUDART__)
-  {
-    result = __partition::partition_inplace(policy,
+  auto ret = last;
+  THRUST_CDP_DISPATCH(
+    (ret =
+       __partition::partition_inplace(policy, first, last, stencil, predicate);
+
+     /* partition returns rejected values in reverse order
+       so reverse the rejected elements to make it stable */
+     cuda_cub::reverse(policy, ret, last);),
+    (ret = thrust::stable_partition(cvt_to_seq(derived_cast(policy)),
                                     first,
                                     last,
                                     stencil,
-                                    predicate);
-
-    // partition returns rejected values in reverese order
-    // so reverse the rejected elements to make it stable
-    cuda_cub::reverse(policy, result, last);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    result = thrust::stable_partition(cvt_to_seq(derived_cast(policy)),
-                                      first,
-                                      last,
-                                      stencil,
-                                      predicate);
-#endif
-  }
-  return result;
+                                    predicate);));
+  return ret;
 }
 
 __thrust_exec_check_disable__
@@ -1104,29 +1050,22 @@ stable_partition(execution_policy<Derived> &policy,
                  Iterator                   last,
                  Predicate                  predicate)
 {
-  Iterator result = first;
-  if (__THRUST_HAS_CUDART__)
-  {
-    result = __partition::partition_inplace(policy,
-                                       first,
-                                       last,
-                                       __partition::no_stencil_tag(),
-                                       predicate);
-
-    // partition returns rejected values in reverese order
-    // so reverse the rejected elements to make it stable
-    cuda_cub::reverse(policy, result, last);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    result = thrust::stable_partition(cvt_to_seq(derived_cast(policy)),
-                                      first,
-                                      last,
-                                      predicate);
-#endif
-  }
-  return result;
+  auto ret = last;
+  THRUST_CDP_DISPATCH(
+    (ret = __partition::partition_inplace(policy,
+                                          first,
+                                          last,
+                                          __partition::no_stencil_tag(),
+                                          predicate);
+
+     /* partition returns rejected values in reverse order
+      so reverse the rejected elements to make it stable */
+     cuda_cub::reverse(policy, ret, last);),
+    (ret = thrust::stable_partition(cvt_to_seq(derived_cast(policy)),
+                                    first,
+                                    last,
+                                    predicate);));
+  return ret;
 }
 
 template <class Derived,
@@ -1145,5 +1084,5 @@ is_partitioned(execution_policy<Derived> &policy,
 
 
 }    // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/per_device_resource.h b/thrust/system/cuda/detail/per_device_resource.h
index 528ac221d..414ea7788 100644
--- a/thrust/system/cuda/detail/per_device_resource.h
+++ b/thrust/system/cuda/detail/per_device_resource.h
@@ -43,7 +43,7 @@
 #include <mutex>
 #include <unordered_map>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace cuda_cub
 {
@@ -64,7 +64,7 @@ MR * get_per_device_resource(execution_policy<DerivedPolicy>&)
 
 }
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #endif
 
diff --git a/thrust/system/cuda/detail/pointer.inl b/thrust/system/cuda/detail/pointer.inl
deleted file mode 100644
index 60f277f59..000000000
--- a/thrust/system/cuda/detail/pointer.inl
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- *  Copyright 2008-2018 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-namespace thrust
-{
-
-// XXX WAR an issue with MSVC 2005 (cl v14.00) incorrectly implementing
-//     pointer_raw_pointer for pointer by specializing it here
-//     note that we specialize it here, before the use of raw_pointer_cast
-//     below, which causes pointer_raw_pointer's instantiation
-#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400)
-namespace detail
-{
-
-template<typename T>
-  struct pointer_raw_pointer< thrust::cuda_cub::pointer<T> >
-{
-  typedef typename thrust::cuda_cub::pointer<T>::raw_pointer type;
-}; // end pointer_raw_pointer
-
-} // end detail
-#endif
-
-namespace cuda_cub {
-
-template <typename T>
-template <typename OtherT>
-__host__ __device__ reference<T> &reference<T>::operator=(
-    const reference<OtherT> &other) {
-  return super_t::operator=(other);
-} // end reference::operator=()
-
-template <typename T>
-__host__ __device__ reference<T> &reference<T>::operator=(const value_type &x) {
-  return super_t::operator=(x);
-} // end reference::operator=()
-
-template<typename T>
-__host__ __device__
-void swap(reference<T> a, reference<T> b)
-{
-  a.swap(b);
-} // end swap()
-
-} // end cuda_cub
-} // end thrust
diff --git a/thrust/system/cuda/detail/reduce.h b/thrust/system/cuda/detail/reduce.h
index d6965258b..41d9075da 100644
--- a/thrust/system/cuda/detail/reduce.h
+++ b/thrust/system/cuda/detail/reduce.h
@@ -26,29 +26,35 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-#include <thrust/system/cuda/config.h>
 
+#include <thrust/detail/alignment.h>
 #include <thrust/detail/cstdint.h>
-#include <thrust/detail/temporary_array.h>
-#include <thrust/system/cuda/detail/util.h>
+#include <thrust/detail/minmax.h>
 #include <thrust/detail/raw_reference_cast.h>
+#include <thrust/detail/temporary_array.h>
 #include <thrust/detail/type_traits/iterator/is_output_iterator.h>
-#include <thrust/system/cuda/detail/cub/device/device_reduce.cuh>
-#include <thrust/system/cuda/detail/par_to_seq.h>
-#include <thrust/system/cuda/detail/get_value.h>
+#include <thrust/distance.h>
 #include <thrust/functional.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
 #include <thrust/system/cuda/detail/core/agent_launcher.h>
-#include <thrust/detail/minmax.h>
-#include <thrust/distance.h>
-#include <thrust/detail/alignment.h>
+#include <thrust/system/cuda/detail/dispatch.h>
+#include <thrust/system/cuda/detail/get_value.h>
+#include <thrust/system/cuda/detail/make_unsigned_special.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/util.h>
 
-THRUST_BEGIN_NS
+#include <cub/device/device_reduce.cuh>
+#include <cub/util_math.cuh>
+
+THRUST_NAMESPACE_BEGIN
 
 // forward declare generic reduce
-// to circumvent circular dependency 
-template <typename DerivedPolicy, 
+// to circumvent circular dependency
+template <typename DerivedPolicy,
           typename InputIterator,
           typename T,
           typename BinaryFunction>
@@ -63,9 +69,6 @@ namespace cuda_cub {
 
 namespace __reduce {
 
-  // XXX should GridSizeType also be able accomodate 64 bit integers
-  typedef int GridSizeType;
-
   template<bool>
   struct is_true : thrust::detail::false_type {};
   template<>
@@ -76,27 +79,25 @@ namespace __reduce {
             int                       _VECTOR_LOAD_LENGTH = 1,
             cub::BlockReduceAlgorithm _BLOCK_ALGORITHM    = cub::BLOCK_REDUCE_RAKING,
             cub::CacheLoadModifier    _LOAD_MODIFIER      = cub::LOAD_DEFAULT,
-            cub::GridMappingStrategy  _GRID_MAPPING       = cub::GRID_MAPPING_DYNAMIC,
-            int                       _MIN_BLOCKS         = 1>
+            cub::GridMappingStrategy  _GRID_MAPPING       = cub::GRID_MAPPING_DYNAMIC>
   struct PtxPolicy
   {
     enum
     {
-      BLOCK_THREADS      = _BLOCK_THREADS,        
-      ITEMS_PER_THREAD   = _ITEMS_PER_THREAD,    
-      VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, 
-      MIN_BLOCKS         = _MIN_BLOCKS,
+      BLOCK_THREADS      = _BLOCK_THREADS,
+      ITEMS_PER_THREAD   = _ITEMS_PER_THREAD,
+      VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH,
       ITEMS_PER_TILE     = _BLOCK_THREADS * _ITEMS_PER_THREAD
     };
 
-    static const cub::BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM;    
-    static const cub::CacheLoadModifier    LOAD_MODIFIER   = _LOAD_MODIFIER;     
-    static const cub::GridMappingStrategy  GRID_MAPPING    = _GRID_MAPPING;     
+    static const cub::BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM;
+    static const cub::CacheLoadModifier    LOAD_MODIFIER   = _LOAD_MODIFIER;
+    static const cub::GridMappingStrategy  GRID_MAPPING    = _GRID_MAPPING;
   }; // struct PtxPolicy
 
   template<class,class>
   struct Tuning;
-  
+
   template <class T>
   struct Tuning<sm30, T>
   {
@@ -108,34 +109,34 @@ namespace __reduce {
       SCALE_FACTOR_1B = sizeof(T),
     };
 
-    typedef PtxPolicy<256,                                 
-                      CUB_MAX(1, 20 / SCALE_FACTOR_4B),   
-                      2,                                 
-                      cub::BLOCK_REDUCE_WARP_REDUCTIONS,    
-                      cub::LOAD_DEFAULT,                   
-                      cub::GRID_MAPPING_RAKE>       
+    typedef PtxPolicy<256,
+                      CUB_MAX(1, 20 / SCALE_FACTOR_4B),
+                      2,
+                      cub::BLOCK_REDUCE_WARP_REDUCTIONS,
+                      cub::LOAD_DEFAULT,
+                      cub::GRID_MAPPING_RAKE>
         type;
   }; // Tuning sm30
-  
+
   template <class T>
   struct Tuning<sm35, T> : Tuning<sm30,T>
   {
     // ReducePolicy1B (GTX Titan: 228.7 GB/s @ 192M 1B items)
-    typedef PtxPolicy<128,                                 
-                      CUB_MAX(1, 24 / Tuning::SCALE_FACTOR_1B),   
-                      4,                                 
-                      cub::BLOCK_REDUCE_WARP_REDUCTIONS,    
-                      cub::LOAD_LDG,                       
-                      cub::GRID_MAPPING_DYNAMIC>          
+    typedef PtxPolicy<128,
+                      CUB_MAX(1, 24 / Tuning::SCALE_FACTOR_1B),
+                      4,
+                      cub::BLOCK_REDUCE_WARP_REDUCTIONS,
+                      cub::LOAD_LDG,
+                      cub::GRID_MAPPING_DYNAMIC>
         ReducePolicy1B;
 
     // ReducePolicy4B types (GTX Titan: 255.1 GB/s @ 48M 4B items)
-    typedef PtxPolicy<256,                                 
-                      CUB_MAX(1, 20 / Tuning::SCALE_FACTOR_4B),   
-                      4,                                 
-                      cub::BLOCK_REDUCE_WARP_REDUCTIONS,     
-                      cub::LOAD_LDG,                        
-                      cub::GRID_MAPPING_DYNAMIC>           
+    typedef PtxPolicy<256,
+                      CUB_MAX(1, 20 / Tuning::SCALE_FACTOR_4B),
+                      4,
+                      cub::BLOCK_REDUCE_WARP_REDUCTIONS,
+                      cub::LOAD_LDG,
+                      cub::GRID_MAPPING_DYNAMIC>
         ReducePolicy4B;
 
     typedef typename thrust::detail::conditional<(sizeof(T) < 4),
@@ -150,6 +151,8 @@ namespace __reduce {
             class ReductionOp>
   struct ReduceAgent
   {
+    typedef typename detail::make_unsigned_special<Size>::type UnsignedSize;
+
     template<class Arch>
     struct PtxPlan : Tuning<Arch,T>::type
     {
@@ -194,6 +197,9 @@ namespace __reduce {
     {
       cub::GridMappingStrategy grid_mapping;
 
+      THRUST_RUNTIME_FUNCTION
+      Plan() {}
+
       template <class P>
       THRUST_RUNTIME_FUNCTION
           Plan(P) : core::AgentPlan(P()),
@@ -201,7 +207,7 @@ namespace __reduce {
       {
       }
     };
-   
+
     // this specialized PtxPlan for a device-compiled Arch
     // ptx_plan type *must* only be used from device code
     // Its use from host code will result in *undefined behaviour*
@@ -458,8 +464,8 @@ namespace __reduce {
       //
       THRUST_DEVICE_FUNCTION T
       consume_tiles(Size /*num_items*/,
-                    cub::GridEvenShare<GridSizeType> &even_share,
-                    cub::GridQueue<GridSizeType> & /*queue*/,
+                    cub::GridEvenShare<Size> &even_share,
+                    cub::GridQueue<UnsignedSize> & /*queue*/,
                     thrust::detail::integral_constant<cub::GridMappingStrategy, cub::GRID_MAPPING_RAKE> /*is_rake*/)
       {
         typedef is_true<ATTEMPT_VECTORIZATION>          attempt_vec;
@@ -489,7 +495,7 @@ namespace __reduce {
       template <class CAN_VECTORIZE>
       THRUST_DEVICE_FUNCTION T
       consume_tiles_impl(Size                         num_items,
-                         cub::GridQueue<GridSizeType> queue,
+                         cub::GridQueue<UnsignedSize> queue,
                          CAN_VECTORIZE                can_vectorize)
       {
         using core::sync_threadblock;
@@ -576,8 +582,8 @@ namespace __reduce {
       THRUST_DEVICE_FUNCTION T
       consume_tiles(
           Size                              num_items,
-          cub::GridEvenShare<GridSizeType> &/*even_share*/,
-          cub::GridQueue<GridSizeType> &    queue,
+          cub::GridEvenShare<Size> &/*even_share*/,
+          cub::GridQueue<UnsignedSize> &    queue,
           thrust::detail::integral_constant<cub::GridMappingStrategy, cub::GRID_MAPPING_DYNAMIC>)
       {
         typedef is_true<ATTEMPT_VECTORIZATION>         attempt_vec;
@@ -589,7 +595,7 @@ namespace __reduce {
                    : consume_tiles_impl(num_items, queue, path_b());
       }
     };    // struct impl
-    
+
     //---------------------------------------------------------------------
     // Agent entry points
     //---------------------------------------------------------------------
@@ -644,8 +650,8 @@ namespace __reduce {
     THRUST_AGENT_ENTRY(InputIt                          input_it,
                        OutputIt                         output_it,
                        Size                             num_items,
-                       cub::GridEvenShare<GridSizeType> even_share,
-                       cub::GridQueue<GridSizeType>     queue,
+                       cub::GridEvenShare<Size> even_share,
+                       cub::GridQueue<UnsignedSize>     queue,
                        ReductionOp                      reduction_op,
                        char *                           shmem)
     {
@@ -665,6 +671,8 @@ namespace __reduce {
   template<class Size>
   struct DrainAgent
   {
+    typedef typename detail::make_unsigned_special<Size>::type UnsignedSize;
+
     template <class Arch>
     struct PtxPlan : PtxPolicy<1> {};
     typedef core::specialize_plan<PtxPlan> ptx_plan;
@@ -673,7 +681,7 @@ namespace __reduce {
     // Agent entry point
     //---------------------------------------------------------------------
 
-    THRUST_AGENT_ENTRY(cub::GridQueue<GridSizeType> grid_queue,
+    THRUST_AGENT_ENTRY(cub::GridQueue<UnsignedSize> grid_queue,
                        Size                         num_items,
                        char * /*shmem*/)
     {
@@ -695,14 +703,15 @@ namespace __reduce {
             T            init,
             ReductionOp  reduction_op,
             OutputIt     output_it,
-            cudaStream_t stream,
-            bool         debug_sync)
+            cudaStream_t stream)
   {
     using core::AgentPlan;
     using core::AgentLauncher;
     using core::get_agent_plan;
     using core::cuda_optional;
 
+    typedef typename detail::make_unsigned_special<Size>::type UnsignedSize;
+
     if (num_items == 0)
       return cudaErrorNotSupported;
 
@@ -727,7 +736,7 @@ namespace __reduce {
       }
       char *vshmem_ptr = vshmem_size > 0 ? (char*)d_temp_storage : NULL;
 
-      reduce_agent ra(reduce_plan, num_items, stream, vshmem_ptr, "reduce_agent: single_tile only", debug_sync);
+      reduce_agent ra(reduce_plan, num_items, stream, vshmem_ptr, "reduce_agent: single_tile only");
       ra.launch(input_it, output_it, num_items, reduction_op, init);
       CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
     }
@@ -743,8 +752,8 @@ namespace __reduce {
               template get_max_blocks_per_sm<InputIt,
                                              OutputIt,
                                              Size,
-                                             cub::GridEvenShare<GridSizeType>,
-                                             cub::GridQueue<GridSizeType>,
+                                             cub::GridEvenShare<Size>,
+                                             cub::GridQueue<UnsignedSize>,
                                              ReductionOp>(reduce_plan);
       CUDA_CUB_RET_IF_FAIL(max_blocks_per_sm.status());
 
@@ -755,7 +764,7 @@ namespace __reduce {
       int sm_oversubscription = 5;
       int max_blocks          = reduce_device_occupancy * sm_oversubscription;
 
-      cub::GridEvenShare<GridSizeType> even_share;
+      cub::GridEvenShare<Size> even_share;
       even_share.DispatchInit(static_cast<int>(num_items), max_blocks,
                               reduce_plan.items_per_tile);
 
@@ -770,7 +779,7 @@ namespace __reduce {
       size_t allocation_sizes[3] =
           {
               max_blocks * sizeof(T),                            // bytes needed for privatized block reductions
-              cub::GridQueue<GridSizeType>::AllocationSize(),    // bytes needed for grid queue descriptor0
+              cub::GridQueue<UnsignedSize>::AllocationSize(),    // bytes needed for grid queue descriptor0
               vshmem_size                                        // size of virtualized shared memory storage
           };
       status = cub::AliasTemporaries(d_temp_storage,
@@ -784,7 +793,7 @@ namespace __reduce {
       }
 
       T *d_block_reductions = (T*) allocations[0];
-      cub::GridQueue<GridSizeType> queue(allocations[1]);
+      cub::GridQueue<UnsignedSize> queue(allocations[1]);
       char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[2] : NULL;
 
 
@@ -798,17 +807,16 @@ namespace __reduce {
       else if (reduce_plan.grid_mapping == cub::GRID_MAPPING_DYNAMIC)
       {
         // Work is distributed dynamically
-        size_t num_tiles = (num_items + reduce_plan.items_per_tile - 1) /
-          reduce_plan.items_per_tile;
+        size_t num_tiles = cub::DivideAndRoundUp(num_items, reduce_plan.items_per_tile);
 
         // if not enough to fill the device with threadblocks
         // then fill the device with threadblocks
-        reduce_grid_size = static_cast<int>(min(num_tiles, static_cast<size_t>(reduce_device_occupancy)));
+        reduce_grid_size = static_cast<int>((min)(num_tiles, static_cast<size_t>(reduce_device_occupancy)));
 
         typedef AgentLauncher<DrainAgent<Size> > drain_agent;
         AgentPlan drain_plan = drain_agent::get_plan();
         drain_plan.grid_size = 1;
-        drain_agent da(drain_plan, stream, "__reduce::drain_agent", debug_sync);
+        drain_agent da(drain_plan, stream, "__reduce::drain_agent");
         da.launch(queue, num_items);
         CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
       }
@@ -818,7 +826,7 @@ namespace __reduce {
       }
 
       reduce_plan.grid_size = reduce_grid_size;
-      reduce_agent ra(reduce_plan, stream, vshmem_ptr, "reduce_agent: regular size reduce", debug_sync);
+      reduce_agent ra(reduce_plan, stream, vshmem_ptr, "reduce_agent: regular size reduce");
       ra.launch(input_it,
                 d_block_reductions,
                 num_items,
@@ -833,7 +841,7 @@ namespace __reduce {
         reduce_agent_single;
 
       reduce_plan.grid_size = 1;
-      reduce_agent_single ra1(reduce_plan, stream, vshmem_ptr, "reduce_agent: single tile reduce", debug_sync);
+      reduce_agent_single ra1(reduce_plan, stream, vshmem_ptr, "reduce_agent: single tile reduce");
 
       ra1.launch(d_block_reductions, output_it, reduce_grid_size, reduction_op, init);
       CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
@@ -860,7 +868,6 @@ namespace __reduce {
 
     size_t       temp_storage_bytes = 0;
     cudaStream_t stream             = cuda_cub::stream(policy);
-    bool         debug_sync         = THRUST_DEBUG_SYNC_FLAG;
 
     cudaError_t status;
     status = doit_step(NULL,
@@ -870,8 +877,7 @@ namespace __reduce {
                        init,
                        binary_op,
                        reinterpret_cast<T*>(NULL),
-                       stream,
-                       debug_sync);
+                       stream);
     cuda_cub::throw_on_error(status, "reduce failed on 1st step");
 
     size_t allocation_sizes[2] = {sizeof(T*), temp_storage_bytes};
@@ -904,8 +910,7 @@ namespace __reduce {
                        init,
                        binary_op,
                        d_result,
-                       stream,
-                       debug_sync);
+                       stream);
     cuda_cub::throw_on_error(status, "reduce failed on 2nd step");
 
     status = cuda_cub::synchronize(policy);
@@ -932,21 +937,18 @@ T reduce_n_impl(execution_policy<Derived>& policy,
                 BinaryOp                   binary_op)
 {
   cudaStream_t stream = cuda_cub::stream(policy);
+  cudaError_t status;
 
   // Determine temporary device storage requirements.
 
   size_t tmp_size = 0;
-  cuda_cub::throw_on_error(
-    cub::DeviceReduce::Reduce(NULL,
-                              tmp_size,
-                              first,
-                              reinterpret_cast<T*>(NULL),
-                              num_items,
-                              binary_op,
-                              init,
-                              stream,
-                              THRUST_DEBUG_SYNC_FLAG),
-    "after reduction step 1");
+
+  THRUST_INDEX_TYPE_DISPATCH(status,
+    cub::DeviceReduce::Reduce,
+    num_items,
+    (NULL, tmp_size, first, reinterpret_cast<T*>(NULL),
+        num_items_fixed, binary_op, init, stream));
+  cuda_cub::throw_on_error(status, "after reduction step 1");
 
   // Allocate temporary storage.
 
@@ -965,22 +967,17 @@ T reduce_n_impl(execution_policy<Derived>& policy,
   // make this guarantee.
   T* ret_ptr = thrust::detail::aligned_reinterpret_cast<T*>(tmp.data().get());
   void* tmp_ptr = static_cast<void*>((tmp.data() + sizeof(T)).get());
-  cuda_cub::throw_on_error(
-    cub::DeviceReduce::Reduce(tmp_ptr,
-                              tmp_size,
-                              first,
-                              ret_ptr,
-                              num_items,
-                              binary_op,
-                              init,
-                              stream,
-                              THRUST_DEBUG_SYNC_FLAG),
-    "after reduction step 2");
+  THRUST_INDEX_TYPE_DISPATCH(status,
+    cub::DeviceReduce::Reduce,
+    num_items,
+    (tmp_ptr, tmp_size, first, ret_ptr,
+        num_items_fixed, binary_op, init, stream));
+  cuda_cub::throw_on_error(status, "after reduction step 2");
 
   // Synchronize the stream and get the value.
 
-  cuda_cub::throw_on_error(cuda_cub::synchronize(policy),
-    "reduce failed to synchronize");
+  status = cuda_cub::synchronize(policy);
+  cuda_cub::throw_on_error(status, "reduce failed to synchronize");
 
   // `tmp.begin()` yields a `normal_iterator`, which dereferences to a
   // `reference`, which has an `operator&` that returns a `pointer`, which
@@ -1000,7 +997,7 @@ T reduce_n_impl(execution_policy<Derived>& policy,
 // Thrust API entry points
 //-------------------------
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename Derived,
           typename InputIt,
           typename Size,
@@ -1013,14 +1010,18 @@ T reduce_n(execution_policy<Derived>& policy,
            T                          init,
            BinaryOp                   binary_op)
 {
-  if (__THRUST_HAS_CUDART__)
-    return thrust::cuda_cub::detail::reduce_n_impl(
-      policy, first, num_items, init, binary_op);
-
-  #if !__THRUST_HAS_CUDART__
-    return thrust::reduce(
-      cvt_to_seq(derived_cast(policy)), first, first + num_items, init, binary_op);
-  #endif
+  THRUST_CDP_DISPATCH((init =
+                         thrust::cuda_cub::detail::reduce_n_impl(policy,
+                                                                 first,
+                                                                 num_items,
+                                                                 init,
+                                                                 binary_op);),
+                      (init = thrust::reduce(cvt_to_seq(derived_cast(policy)),
+                                             first,
+                                             first + num_items,
+                                             init,
+                                             binary_op);));
+  return init;
 }
 
 template <class Derived, class InputIt, class T, class BinaryOp>
@@ -1064,7 +1065,7 @@ reduce(execution_policy<Derived> &policy,
 
 } // namespace cuda_cub
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #include <thrust/memory.h>
 #include <thrust/reduce.h>
diff --git a/thrust/system/cuda/detail/reduce_by_key.h b/thrust/system/cuda/detail/reduce_by_key.h
index 229b1dc40..2933d062a 100644
--- a/thrust/system/cuda/detail/reduce_by_key.h
+++ b/thrust/system/cuda/detail/reduce_by_key.h
@@ -26,28 +26,32 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-#include <thrust/system/cuda/config.h>
-#include <thrust/detail/type_traits.h>
 
+#include <thrust/detail/alignment.h>
 #include <thrust/detail/cstdint.h>
-#include <thrust/detail/temporary_array.h>
-#include <thrust/system/cuda/detail/util.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/detail/mpl/math.h>
 #include <thrust/detail/raw_reference_cast.h>
+#include <thrust/detail/temporary_array.h>
 #include <thrust/detail/type_traits/iterator/is_output_iterator.h>
-#include <thrust/system/cuda/detail/cub/device/device_reduce.cuh>
-#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/distance.h>
+#include <thrust/functional.h>
+#include <thrust/pair.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
 #include <thrust/system/cuda/detail/core/agent_launcher.h>
 #include <thrust/system/cuda/detail/get_value.h>
-#include <thrust/pair.h>
-#include <thrust/functional.h>
-#include <thrust/detail/mpl/math.h>
-#include <thrust/detail/minmax.h>
-#include <thrust/distance.h>
-#include <thrust/detail/alignment.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <cub/device/device_reduce.cuh>
+#include <cub/util_math.cuh>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 template <typename DerivedPolicy,
           typename InputIterator1,
@@ -68,7 +72,7 @@ reduce_by_key(
 namespace cuda_cub {
 
 namespace __reduce_by_key {
-  
+
   template<bool> struct is_true : thrust::detail::false_type {};
   template<> struct is_true<true> : thrust::detail::true_type {};
 
@@ -78,16 +82,14 @@ namespace __reduce_by_key {
             int                     _ITEMS_PER_THREAD = 1,
             cub::BlockLoadAlgorithm _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
             cub::CacheLoadModifier  _LOAD_MODIFIER    = cub::LOAD_DEFAULT,
-            cub::BlockScanAlgorithm _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS,
-            int                     _MIN_BLOCKS       = 1>
+            cub::BlockScanAlgorithm _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS>
   struct PtxPolicy
   {
     enum
     {
       BLOCK_THREADS    = _BLOCK_THREADS,
       ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
-      ITEMS_PER_TILE   = BLOCK_THREADS * ITEMS_PER_THREAD,
-      MIN_BLOCKS       = _MIN_BLOCKS
+      ITEMS_PER_TILE   = BLOCK_THREADS * ITEMS_PER_THREAD
     };
 
     static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
@@ -97,7 +99,7 @@ namespace __reduce_by_key {
 
   template <class Arch, class Key, class Value>
   struct Tuning;
-  
+
   template <class Key, class Value>
   struct Tuning<sm30, Key, Value>
   {
@@ -114,9 +116,9 @@ namespace __reduce_by_key {
           mpl::max<
               int,
               1,
-              ((NOMINAL_4B_ITEMS_PER_THREAD * 8) +
+              static_cast<int>(((NOMINAL_4B_ITEMS_PER_THREAD * 8) +
                COMBINED_INPUT_BYTES - 1) /
-                  COMBINED_INPUT_BYTES>::value>::value,
+                  COMBINED_INPUT_BYTES)>::value>::value,
     };
 
     typedef PtxPolicy<128,
@@ -132,10 +134,13 @@ namespace __reduce_by_key {
   {
     enum
     {
+      MAX_INPUT_BYTES      = mpl::max<size_t, sizeof(Key), sizeof(Value)>::value,
+      COMBINED_INPUT_BYTES = sizeof(Key) + sizeof(Value),
+
       NOMINAL_4B_ITEMS_PER_THREAD = 6,
 
       ITEMS_PER_THREAD =
-          (Tuning::MAX_INPUT_BYTES <= 8)
+          (MAX_INPUT_BYTES <= 8)
               ? 6
               : mpl::min<
                     int,
@@ -144,9 +149,9 @@ namespace __reduce_by_key {
                         int,
                         1,
                         ((NOMINAL_4B_ITEMS_PER_THREAD * 8) +
-                         Tuning::COMBINED_INPUT_BYTES - 1) /
-                            Tuning::COMBINED_INPUT_BYTES>::value>::value,
-    };  
+                         COMBINED_INPUT_BYTES - 1) /
+                            COMBINED_INPUT_BYTES>::value>::value,
+    };
 
     typedef PtxPolicy<128,
                       ITEMS_PER_THREAD,
@@ -155,16 +160,19 @@ namespace __reduce_by_key {
                       cub::BLOCK_SCAN_WARP_SCANS>
         type;
   };    // Tuning sm35
-  
+
   template<class Key, class Value>
   struct Tuning<sm52,Key,Value> : Tuning<sm30,Key,Value>
   {
     enum
     {
+      MAX_INPUT_BYTES      = mpl::max<size_t, sizeof(Key), sizeof(Value)>::value,
+      COMBINED_INPUT_BYTES = sizeof(Key) + sizeof(Value),
+
       NOMINAL_4B_ITEMS_PER_THREAD = 9,
 
       ITEMS_PER_THREAD =
-          (Tuning::MAX_INPUT_BYTES <= 8)
+          (MAX_INPUT_BYTES <= 8)
               ? 9
               : mpl::min<
                     int,
@@ -173,9 +181,9 @@ namespace __reduce_by_key {
                         int,
                         1,
                         ((NOMINAL_4B_ITEMS_PER_THREAD * 8) +
-                         Tuning::COMBINED_INPUT_BYTES - 1) /
-                            Tuning::COMBINED_INPUT_BYTES>::value>::value,
-    };  
+                         COMBINED_INPUT_BYTES - 1) /
+                            COMBINED_INPUT_BYTES>::value>::value,
+    };
 
     typedef PtxPolicy<256,
                       ITEMS_PER_THREAD,
@@ -238,12 +246,12 @@ namespace __reduce_by_key {
 
       union TempStorage
       {
-        struct
+        struct ScanStorage
         {
           typename BlockScan::TempStorage              scan;
           typename TilePrefixCallback::TempStorage     prefix;
           typename BlockDiscontinuityKeys::TempStorage discontinuity;
-        };
+        } scan_storage;
 
         typename BlockLoadKeys::TempStorage   load_keys;
         typename BlockLoadValues::TempStorage load_values;
@@ -307,7 +315,7 @@ namespace __reduce_by_key {
         size_value_pair_t identity;
         identity.value = 0;
         identity.key   = 0;
-        BlockScan(storage.scan)
+        BlockScan(storage.scan_storage.scan)
             .ExclusiveScan(scan_items, scan_items, identity, scan_op, tile_aggregate);
       }
 
@@ -319,7 +327,7 @@ namespace __reduce_by_key {
                 size_value_pair_t &tile_aggregate,
                 thrust::detail::false_type /* has_identity */)
       {
-        BlockScan(storage.scan)
+        BlockScan(storage.scan_storage.scan)
             .ExclusiveScan(scan_items, scan_items, scan_op, tile_aggregate);
       }
 
@@ -331,7 +339,7 @@ namespace __reduce_by_key {
                 TilePrefixCallback &prefix_op,
                 thrust::detail::true_type /*  has_identity */)
       {
-        BlockScan(storage.scan)
+        BlockScan(storage.scan_storage.scan)
             .ExclusiveScan(scan_items,
                            scan_items,
                            scan_op,
@@ -347,7 +355,7 @@ namespace __reduce_by_key {
                 TilePrefixCallback &prefix_op,
                 thrust::detail::false_type /* has_identity */)
       {
-        BlockScan(storage.scan)
+        BlockScan(storage.scan_storage.scan)
             .ExclusiveScan(scan_items,
                            scan_items,
                            scan_op,
@@ -400,7 +408,7 @@ namespace __reduce_by_key {
       //---------------------------------------------------------------------
       // Scatter utility methods
       //---------------------------------------------------------------------
-    
+
       // Directly scatter flagged items to output offsets
       // (specialized for IS_SEGMENTED_REDUCTION_FIXUP == false)
       THRUST_DEVICE_FUNCTION void scatter_direct(
@@ -424,7 +432,7 @@ namespace __reduce_by_key {
       // (specialized for IS_SEGMENTED_REDUCTION_FIXUP == false
       //
       // The exclusive scan causes each head flag to be paired with
-      // the previous value aggregate: 
+      // the previous value aggregate:
       //   * the scatter offsets must be decremented for value aggregates
       //
       THRUST_DEVICE_FUNCTION void scatter_two_phase(
@@ -444,8 +452,9 @@ namespace __reduce_by_key {
         {
           if (segment_flags[ITEM])
           {
-            storage.raw_exchange[segment_indices[ITEM] -
-                                 num_tile_segments_prefix] = scatter_items[ITEM];
+            int idx = static_cast<int>(segment_indices[ITEM] -
+                                       num_tile_segments_prefix);
+            storage.raw_exchange[idx] = scatter_items[ITEM];
           }
         }
 
@@ -503,7 +512,7 @@ namespace __reduce_by_key {
         // Last thread will output final count and last item, if necessary
         if (threadIdx.x == BLOCK_THREADS - 1)
         {
-          // If the last tile is a whole tile, the inclusive prefix 
+          // If the last tile is a whole tile, the inclusive prefix
           // contains accumulated value reduction for the last segment
           if (num_remaining == ITEMS_PER_TILE)
           {
@@ -517,7 +526,7 @@ namespace __reduce_by_key {
           *num_runs_output_it = num_segments;
         }
       }
-    
+
       //---------------------------------------------------------------------
       // Cooperatively scan a device-wide sequence of tiles with other CTAs
       //---------------------------------------------------------------------
@@ -580,7 +589,7 @@ namespace __reduce_by_key {
 
         // Set head segment_flags.
         // First tile sets the first flag for the first item
-        BlockDiscontinuityKeys(storage.discontinuity)
+        BlockDiscontinuityKeys(storage.scan_storage.discontinuity)
             .FlagHeads(segment_flags, keys, pred_keys, inequality_op);
 
         // Unset the flag for the first item in the first tile
@@ -605,7 +614,7 @@ namespace __reduce_by_key {
           if (!IS_LAST_TILE)
             tile_state.SetInclusive(0, tile_aggregate);
 
-          // Initialize the segment index for the first scan item if necessary 
+          // Initialize the segment index for the first scan item if necessary
           // (the exclusive prefix for the first item is garbage)
           if (!HAS_IDENTITY_ZERO)
             scan_items[0].key = 0;
@@ -694,7 +703,7 @@ namespace __reduce_by_key {
         sync_threadblock();
 
         // Set head segment_flags
-        BlockDiscontinuityKeys(storage.discontinuity)
+        BlockDiscontinuityKeys(storage.scan_storage.discontinuity)
             .FlagHeads(segment_flags,
                        keys,
                        pred_keys,
@@ -709,7 +718,7 @@ namespace __reduce_by_key {
 
         // Exclusive scan of values and segment_flags
         size_value_pair_t  tile_aggregate;
-        TilePrefixCallback prefix_op(tile_state, storage.prefix, scan_op, tile_idx);
+        TilePrefixCallback prefix_op(tile_state, storage.scan_storage.prefix, scan_op, tile_idx);
         scan_tile(scan_items,
                   tile_aggregate,
                   prefix_op,
@@ -785,7 +794,7 @@ namespace __reduce_by_key {
         // so just assign one tile per block
         //
         int  tile_idx          = blockIdx.x;
-        Size tile_offset       = tile_idx * ITEMS_PER_TILE;
+        Size tile_offset       = static_cast<Size>(tile_idx) * ITEMS_PER_TILE;
         Size num_remaining     = num_items - tile_offset;
 
         if (num_remaining > ITEMS_PER_TILE)
@@ -877,8 +886,7 @@ namespace __reduce_by_key {
             EqualityOp      equality_op,
             ReductionOp     reduction_op,
             Size            num_items,
-            cudaStream_t    stream,
-            bool            debug_sync)
+            cudaStream_t    stream)
   {
     using core::AgentPlan;
     using core::AgentLauncher;
@@ -910,7 +918,7 @@ namespace __reduce_by_key {
 
     // Number of input tiles
     int  tile_size = reduce_by_key_plan.items_per_tile;
-    Size num_tiles = (num_items + tile_size - 1) / tile_size;
+    Size num_tiles = cub::DivideAndRoundUp(num_items, tile_size);
 
     size_t vshmem_size = core::vshmem_size(reduce_by_key_plan.shared_memory_size,
                                            num_tiles);
@@ -930,12 +938,12 @@ namespace __reduce_by_key {
     {
       return status;
     }
-    
+
     ScanTileState tile_state;
     status = tile_state.Init(static_cast<int>(num_tiles), allocations[0], allocation_sizes[0]);
     CUDA_CUB_RET_IF_FAIL(status);
 
-    init_agent ia(init_plan, num_tiles, stream, "reduce_by_key::init_agent", debug_sync);
+    init_agent ia(init_plan, num_tiles, stream, "reduce_by_key::init_agent");
     ia.launch(tile_state, num_tiles, num_runs_output_it);
     CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
 
@@ -945,8 +953,7 @@ namespace __reduce_by_key {
                              num_items,
                              stream,
                              vshmem_ptr,
-                             "reduce_by_keys::reduce_by_key_agent",
-                             debug_sync);
+                             "reduce_by_keys::reduce_by_key_agent");
     rbka.launch(keys_input_it,
                 values_input_it,
                 keys_output_it,
@@ -961,7 +968,8 @@ namespace __reduce_by_key {
     return status;
   }
 
-  template <typename Derived,
+  template <typename Size,
+            typename Derived,
             typename KeysInputIt,
             typename ValuesInputIt,
             typename KeysOutputIt,
@@ -970,24 +978,22 @@ namespace __reduce_by_key {
             typename ReductionOp>
   THRUST_RUNTIME_FUNCTION
   pair<KeysOutputIt, ValuesOutputIt>
-  reduce_by_key(execution_policy<Derived>& policy,
-                KeysInputIt                keys_first,
-                KeysInputIt                keys_last,
-                ValuesInputIt              values_first,
-                KeysOutputIt               keys_output,
-                ValuesOutputIt             values_output,
-                EqualityOp                 equality_op,
-                ReductionOp                reduction_op)
+  reduce_by_key_dispatch(execution_policy<Derived>& policy,
+                         KeysInputIt                keys_first,
+                         Size                       num_items,
+                         ValuesInputIt              values_first,
+                         KeysOutputIt               keys_output,
+                         ValuesOutputIt             values_output,
+                         EqualityOp                 equality_op,
+                         ReductionOp                reduction_op)
   {
-    typedef int size_type;
-
-    size_type    num_items          = static_cast<size_type>(thrust::distance(keys_first, keys_last));
     size_t       temp_storage_bytes = 0;
     cudaStream_t stream             = cuda_cub::stream(policy);
-    bool         debug_sync         = THRUST_DEBUG_SYNC_FLAG;
-    
+
     if (num_items == 0)
+    {
       return thrust::make_pair(keys_output, values_output);
+    }
 
     cudaError_t status;
     status = doit_step(NULL,
@@ -996,15 +1002,14 @@ namespace __reduce_by_key {
                        values_first,
                        keys_output,
                        values_output,
-                       reinterpret_cast<size_type*>(NULL),
+                       reinterpret_cast<Size*>(NULL),
                        equality_op,
                        reduction_op,
                        num_items,
-                       stream,
-                       debug_sync);
+                       stream);
     cuda_cub::throw_on_error(status, "reduce_by_key failed on 1st step");
 
-    size_t allocation_sizes[2] = {sizeof(size_type), temp_storage_bytes};
+    size_t allocation_sizes[2] = {sizeof(Size), temp_storage_bytes};
     void * allocations[2]      = {NULL, NULL};
 
     size_t storage_size = 0;
@@ -1025,8 +1030,8 @@ namespace __reduce_by_key {
                                  allocation_sizes);
     cuda_cub::throw_on_error(status, "reduce failed on 2nd alias_storage");
 
-    size_type* d_num_runs_out
-      = thrust::detail::aligned_reinterpret_cast<size_type*>(allocations[0]);
+    Size* d_num_runs_out
+      = thrust::detail::aligned_reinterpret_cast<Size*>(allocations[0]);
 
     status = doit_step(allocations[1],
                        temp_storage_bytes,
@@ -1038,8 +1043,7 @@ namespace __reduce_by_key {
                        equality_op,
                        reduction_op,
                        num_items,
-                       stream,
-                       debug_sync);
+                       stream);
     cuda_cub::throw_on_error(status, "reduce_by_key failed on 2nd step");
 
     status = cuda_cub::synchronize(policy);
@@ -1053,13 +1057,57 @@ namespace __reduce_by_key {
     );
   }
 
+  template <typename Derived,
+            typename KeysInputIt,
+            typename ValuesInputIt,
+            typename KeysOutputIt,
+            typename ValuesOutputIt,
+            typename EqualityOp,
+            typename ReductionOp>
+  THRUST_RUNTIME_FUNCTION
+  pair<KeysOutputIt, ValuesOutputIt>
+  reduce_by_key(execution_policy<Derived>& policy,
+                KeysInputIt                keys_first,
+                KeysInputIt                keys_last,
+                ValuesInputIt              values_first,
+                KeysOutputIt               keys_output,
+                ValuesOutputIt             values_output,
+                EqualityOp                 equality_op,
+                ReductionOp                reduction_op)
+  {
+    using size_type = typename iterator_traits<KeysInputIt>::difference_type;
+
+    size_type num_items = thrust::distance(keys_first, keys_last);
+
+    pair<KeysOutputIt, ValuesOutputIt> result = thrust::make_pair(keys_output, values_output);
+
+    if (num_items == 0)
+    {
+      return result;
+    }
+
+    THRUST_INDEX_TYPE_DISPATCH(result,
+                               reduce_by_key_dispatch,
+                               num_items,
+                               (policy,
+                                keys_first,
+                                num_items_fixed,
+                                values_first,
+                                keys_output,
+                                values_output,
+                                equality_op,
+                                reduction_op));
+
+    return result;
+  }
+
 }    // namespace __reduce_by_key
 
 //-------------------------
 // Thrust API entry points
 //-------------------------
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <class Derived,
           class KeyInputIt,
           class ValInputIt,
@@ -1077,35 +1125,27 @@ reduce_by_key(execution_policy<Derived> &policy,
               BinaryPred                 binary_pred,
               BinaryOp                   binary_op)
 {
-  pair<KeyOutputIt, ValOutputIt> ret = thrust::make_pair(keys_output, values_output);
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __reduce_by_key::reduce_by_key(policy,
-                                         keys_first,
-                                         keys_last,
-                                         values_first,
-                                         keys_output,
-                                         values_output,
-                                         binary_pred,
-                                         binary_op);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::reduce_by_key(cvt_to_seq(derived_cast(policy)),
-                                keys_first,
-                                keys_last,
-                                values_first,
-                                keys_output,
-                                values_output,
-                                binary_pred,
-                                binary_op);
-#endif
-  }
+  auto ret = thrust::make_pair(keys_output, values_output);
+  THRUST_CDP_DISPATCH((ret = __reduce_by_key::reduce_by_key(policy,
+                                                            keys_first,
+                                                            keys_last,
+                                                            values_first,
+                                                            keys_output,
+                                                            values_output,
+                                                            binary_pred,
+                                                            binary_op);),
+                      (ret =
+                         thrust::reduce_by_key(cvt_to_seq(derived_cast(policy)),
+                                               keys_first,
+                                               keys_last,
+                                               values_first,
+                                               keys_output,
+                                               values_output,
+                                               binary_pred,
+                                               binary_op);));
   return ret;
 }
 
-
 template <class Derived,
           class KeyInputIt,
           class ValInputIt,
@@ -1161,7 +1201,7 @@ reduce_by_key(execution_policy<Derived> &policy,
 
 } // namespace cuda_
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #include <thrust/memory.h>
 #include <thrust/reduce.h>
diff --git a/thrust/system/cuda/detail/remove.h b/thrust/system/cuda/detail/remove.h
index 2e252c61d..836d8f5ea 100644
--- a/thrust/system/cuda/detail/remove.h
+++ b/thrust/system/cuda/detail/remove.h
@@ -26,11 +26,12 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <thrust/system/cuda/detail/copy_if.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 // in-place
@@ -73,8 +74,9 @@ remove(execution_policy<Derived> &policy,
        InputIt                    last,
        const T &                  value)
 {
-  thrust::detail::equal_to_value<T> pred(value);
-  return cuda_cub::remove_if(policy, first, last, pred);
+  using thrust::placeholders::_1;
+
+  return cuda_cub::remove_if(policy, first, last, _1 == value);
 }
 
 // copy
@@ -128,5 +130,5 @@ remove_copy(execution_policy<Derived> &policy,
 }
 
 }    // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/replace.h b/thrust/system/cuda/detail/replace.h
index 3a99dd7c8..af8b8fa95 100644
--- a/thrust/system/cuda/detail/replace.h
+++ b/thrust/system/cuda/detail/replace.h
@@ -26,12 +26,13 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <thrust/system/cuda/detail/transform.h>
 #include <thrust/detail/internal_functional.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
   namespace __replace
@@ -64,14 +65,14 @@ namespace cuda_cub {
 
       template<class T>
       OutputType THRUST_DEVICE_FUNCTION
-      operator()(T const &x) const
+      operator()(T const &x)
       {
         return pred(x) ? new_value : x;
       }
 
       template<class T, class P>
       OutputType THRUST_DEVICE_FUNCTION
-      operator()(T const &x, P const& y) const
+      operator()(T const &x, P const& y)
       {
         return pred(y) ? new_value : x;
       }
@@ -89,12 +90,14 @@ replace(execution_policy<Derived> &policy,
         T const &                  old_value,
         T const &                  new_value)
 {
+  using thrust::placeholders::_1;
+
   cuda_cub::transform_if(policy,
                       first,
                       last,
                       first,
                       __replace::constant_f<T>(new_value),
-                      thrust::detail::equal_to_value<T>(old_value));
+                      _1 == old_value);
 }
 
 template <class Derived,
@@ -206,5 +209,5 @@ replace_copy(execution_policy<Derived> &policy,
 }
 
 }    // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/reverse.h b/thrust/system/cuda/detail/reverse.h
index 4ce432683..7c4cb867e 100644
--- a/thrust/system/cuda/detail/reverse.h
+++ b/thrust/system/cuda/detail/reverse.h
@@ -26,11 +26,12 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <thrust/system/cuda/detail/execution_policy.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 template <class Derived, class ItemsIt, class ResultIt>
@@ -47,7 +48,7 @@ reverse(execution_policy<Derived> &policy,
         ItemsIt                    last);
 
 }    // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #include <thrust/advance.h>
 #include <thrust/distance.h>
@@ -55,7 +56,7 @@ THRUST_END_NS
 #include <thrust/system/cuda/detail/copy.h>
 #include <thrust/iterator/reverse_iterator.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 template <class Derived,
@@ -68,8 +69,8 @@ reverse_copy(execution_policy<Derived> &policy,
              ResultIt                   result)
 {
   return cuda_cub::copy(policy,
-                        make_reverse_iterator(last),
-                        make_reverse_iterator(first),
+                        thrust::make_reverse_iterator(last),
+                        thrust::make_reverse_iterator(first),
                         result);
 }
 
@@ -85,12 +86,12 @@ reverse(execution_policy<Derived> &policy,
   // find the midpoint of [first,last)
   difference_type N = thrust::distance(first, last);
   ItemsIt mid(first);
-  advance(mid, N / 2);
+  thrust::advance(mid, N / 2);
 
-  cuda_cub::swap_ranges(policy, first, mid, make_reverse_iterator(last));
+  cuda_cub::swap_ranges(policy, first, mid, thrust::make_reverse_iterator(last));
 }
 
 
 }    // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/scan.h b/thrust/system/cuda/detail/scan.h
index d857e4016..fdab8df84 100644
--- a/thrust/system/cuda/detail/scan.h
+++ b/thrust/system/cuda/detail/scan.h
@@ -26,893 +26,331 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-#include <thrust/system/cuda/config.h>
-#include <thrust/detail/type_traits.h>
-#include <thrust/functional.h>
-#include <thrust/detail/type_traits/iterator/is_output_iterator.h>
 
-#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/detail/config/exec_check_disable.h>
 #include <thrust/detail/cstdint.h>
-#include <thrust/detail/temporary_array.h>
-#include <thrust/system/cuda/detail/util.h>
-#include <thrust/system/cuda/detail/cub/device/device_scan.cuh>
-#include <thrust/system/cuda/detail/core/agent_launcher.h>
-#include <thrust/system/cuda/detail/par_to_seq.h>
-#include <thrust/detail/mpl/math.h>
-#include <thrust/detail/minmax.h>
+#include <thrust/detail/type_traits.h>
 #include <thrust/distance.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/dispatch.h>
 
-THRUST_BEGIN_NS
-template <typename DerivedPolicy,
-          typename InputIterator,
-          typename OutputIterator,
-          typename AssociativeOperator>
-__host__ __device__ OutputIterator
-inclusive_scan(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-               InputIterator                                               first,
-               InputIterator                                               last,
-               OutputIterator                                              result,
-               AssociativeOperator                                         binary_op);
-
-template <typename DerivedPolicy,
-          typename InputIterator,
-          typename OutputIterator,
-          typename T,
-          typename AssociativeOperator>
-__host__ __device__ OutputIterator
-exclusive_scan(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-               InputIterator                                               first,
-               InputIterator                                               last,
-               OutputIterator                                              result,
-               T                                                           init,
-               AssociativeOperator                                         binary_op);
-THRUST_END_NS
-
-THRUST_BEGIN_NS
-namespace cuda_cub {
-
-namespace __scan {
-
-  namespace mpl = thrust::detail::mpl::math;
-
-  template<class>
-  struct WarpSize { enum { value = 32 }; };
-
-  template <int                      _BLOCK_THREADS,
-            int                      _ITEMS_PER_THREAD = 1,
-            cub::BlockLoadAlgorithm  _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
-            cub::CacheLoadModifier   _LOAD_MODIFIER    = cub::LOAD_DEFAULT,
-            cub::BlockStoreAlgorithm _STORE_ALGORITHM  = cub::BLOCK_STORE_DIRECT,
-            cub::BlockScanAlgorithm  _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS,
-            int                      _MIN_BLOCKS       = 1>
-  struct PtxPolicy
-  {
-    enum
-    {
-      BLOCK_THREADS    = _BLOCK_THREADS,
-      ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
-      ITEMS_PER_TILE   = BLOCK_THREADS * ITEMS_PER_THREAD,
-      MIN_BLOCKS       = _MIN_BLOCKS
-    };
-
-    static const cub::BlockLoadAlgorithm  LOAD_ALGORITHM  = _LOAD_ALGORITHM;
-    static const cub::CacheLoadModifier   LOAD_MODIFIER   = _LOAD_MODIFIER;
-    static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
-    static const cub::BlockScanAlgorithm  SCAN_ALGORITHM  = _SCAN_ALGORITHM;
-  };    // struct PtxPolicy
-
-
-  // Scale the number of warps to keep same amount of "tile" storage
-  // as the nominal configuration for 4B data.  Minimum of two warps.
-  //
-  template<class Arch, int NOMINAL_4B_BLOCK_THREADS, class T>
-  struct THRUST_BLOCK_THREADS
-  {
-    enum
-    {
-      value = mpl::min<int,
-                       NOMINAL_4B_BLOCK_THREADS,
-                       mpl::max<int,
-                                3,
-                                ((NOMINAL_4B_BLOCK_THREADS /
-                                  WarpSize<Arch>::value) *
-                                 4) /
-                                    sizeof(T)>::value *
-                           WarpSize<Arch>::value>::value
-    };
-  }; // struct THRUST_BLOCK_THREADS
-
-  // If necessary, scale down number of items per thread to keep
-  // the same amount of "tile" storage as the nominal configuration for 4B data.
-  // Minimum 1 item per thread
-  //
-  template <class Arch,
-            int NOMINAL_4B_ITEMS_PER_THREAD,
-            int NOMINAL_4B_BLOCK_THREADS,
-            class T>
-  struct THRUST_ITEMS_PER_THREAD
-  {
-    enum
-    {
-      value = mpl::min<
-          int,
-          NOMINAL_4B_ITEMS_PER_THREAD,
-          mpl::max<
-              int,
-              1,
-              (NOMINAL_4B_ITEMS_PER_THREAD *
-               NOMINAL_4B_BLOCK_THREADS * 4 / sizeof(T)) /
-                  THRUST_BLOCK_THREADS<Arch,
-                                       NOMINAL_4B_BLOCK_THREADS,
-                                       T>::value>::value>::value
-    };
-  };
-
-
-  template <class Arch, class T, class U>
-  struct Tuning;
-  
-  template<class T, class U>
-  struct Tuning<sm30,T,U>
-  {
-    typedef sm30 Arch;
-    enum
-    {
-      NOMINAL_4B_BLOCK_THREADS    = 256,
-      NOMINAL_4B_ITEMS_PER_THREAD = 9,
-    };
-
-    typedef PtxPolicy<THRUST_BLOCK_THREADS<Arch,
-                                           NOMINAL_4B_BLOCK_THREADS,
-                                           T>::value,
-                      THRUST_ITEMS_PER_THREAD<Arch,
-                                              NOMINAL_4B_ITEMS_PER_THREAD,
-                                              NOMINAL_4B_BLOCK_THREADS,
-                                              T>::value,
-                      cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,
-                      cub::LOAD_DEFAULT,
-                      cub::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
-                      cub::BLOCK_SCAN_RAKING_MEMOIZE>
-        type;
-  };    // struct Tuning for sm30
-  
-  template<class T, class U>
-  struct Tuning<sm35,T,U>
-  {
-    typedef sm35 Arch;
-    enum
-    {
-      NOMINAL_4B_BLOCK_THREADS    = 128,
-      NOMINAL_4B_ITEMS_PER_THREAD = 12,
-    };
-
-    typedef PtxPolicy<THRUST_BLOCK_THREADS<Arch,
-                                           NOMINAL_4B_BLOCK_THREADS,
-                                           T>::value,
-                      THRUST_ITEMS_PER_THREAD<Arch,
-                                              NOMINAL_4B_ITEMS_PER_THREAD,
-                                              NOMINAL_4B_BLOCK_THREADS,
-                                              T>::value,
-                      cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,
-                      cub::LOAD_LDG,
-                      cub::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
-                      cub::BLOCK_SCAN_RAKING>
-        type;
-  };    // struct Tuning for sm35
-  
-  template<class T, class U>
-  struct Tuning<sm52,T,U>
-  {
-    typedef sm52 Arch;
-    enum
-    {
-      NOMINAL_4B_BLOCK_THREADS    = 128,
-      NOMINAL_4B_ITEMS_PER_THREAD = 12,
-    };
-
-    typedef PtxPolicy<THRUST_BLOCK_THREADS<Arch,
-                                           NOMINAL_4B_BLOCK_THREADS,
-                                           T>::value,
-                      THRUST_ITEMS_PER_THREAD<Arch,
-                                              NOMINAL_4B_ITEMS_PER_THREAD,
-                                              NOMINAL_4B_BLOCK_THREADS,
-                                              T>::value,
-                      cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,
-                      cub::LOAD_LDG,
-                      cub::BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED,
-                      cub::BLOCK_SCAN_RAKING>
-        type;
-  };    // struct Tuning for sm52
-
-  template <class InputIt,
-            class OutputIt,
-            class ScanOp,
-            class Size,
-            class T,
-            class Inclusive>
-  struct ScanAgent
-  {
-    typedef cub::ScanTileState<T> ScanTileState;
-    typedef cub::BlockScanRunningPrefixOp<T, ScanOp> RunningPrefixCallback;
-
-    template<class Arch>
-    struct PtxPlan : Tuning<Arch,T,T>::type
-    {
-      typedef Tuning<Arch, T, T> tuning;
-
-
-      typedef typename core::LoadIterator<PtxPlan, InputIt>::type LoadIt;
-      typedef typename core::BlockLoad<PtxPlan, LoadIt, T>::type    BlockLoad;
-      typedef typename core::BlockStore<PtxPlan, OutputIt, T>::type BlockStore;
-
-      typedef cub::TilePrefixCallbackOp<T, ScanOp, ScanTileState, Arch::ver>
-          TilePrefixCallback;
-      typedef cub::BlockScan<T,
-                             PtxPlan::BLOCK_THREADS,
-                             PtxPlan::SCAN_ALGORITHM,
-                             1,
-                             1,
-                             Arch::ver>
-          BlockScan;
-
-      union TempStorage
-      {
-        typename BlockLoad::TempStorage  load;
-        typename BlockStore::TempStorage store;
-
-        struct
-        {
-          typename TilePrefixCallback::TempStorage prefix;
-          typename BlockScan::TempStorage          scan;
-        };
-      };    // struct TempStorage
-    };    // struct PtxPlan
-    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
-
-    typedef typename ptx_plan::LoadIt             LoadIt;
-    typedef typename ptx_plan::BlockLoad          BlockLoad;
-    typedef typename ptx_plan::BlockStore         BlockStore;
-    typedef typename ptx_plan::TilePrefixCallback TilePrefixCallback;
-    typedef typename ptx_plan::BlockScan          BlockScan;
-    typedef typename ptx_plan::TempStorage        TempStorage;
-
-    enum
-    {
-      INCLUSIVE        = Inclusive::value,
-      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
-      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
-      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE,
-
-      SYNC_AFTER_LOAD = (ptx_plan::LOAD_ALGORITHM != cub::BLOCK_LOAD_DIRECT),
-    };
-
-    struct impl
-    {
-      //---------------------------------------------------------------------
-      // Per thread data
-      //---------------------------------------------------------------------
-
-      TempStorage &storage;
-      ScanTileState &tile_state;
-      LoadIt load_it;
-      OutputIt output_it;
-      ScanOp scan_op;
-
-      //---------------------------------------------------------------------
-      // Block scan utility methods (first tile)
-      //---------------------------------------------------------------------
-
-      // Exclusive scan specialization
-      //
-      template <class _ScanOp>
-      void THRUST_DEVICE_FUNCTION scan_tile(T (&items)[ITEMS_PER_THREAD],
-                                            _ScanOp scan_op,
-                                            T &     block_aggregate,
-                                            thrust::detail::false_type /* is_inclusive */)
-      {
-        BlockScan(storage.scan).ExclusiveScan(items, items, scan_op, block_aggregate);
-      }
-
-      // Exclusive sum specialization
-      //
-      void THRUST_DEVICE_FUNCTION scan_tile(T (&items)[ITEMS_PER_THREAD],
-                                            plus<T> /*scan_op*/,
-                                            T &     block_aggregate,
-                                            thrust::detail::false_type /* is_inclusive */)
-      {
-        BlockScan(storage.scan).ExclusiveSum(items, items, block_aggregate);
-      }
-
-      // Inclusive scan specialization
-      //
-      template <typename _ScanOp>
-      void THRUST_DEVICE_FUNCTION scan_tile(T (&items)[ITEMS_PER_THREAD],
-                                            _ScanOp scan_op,
-                                            T &     block_aggregate,
-                                            thrust::detail::true_type /* is_inclusive */)
-      {
-        BlockScan(storage.scan).InclusiveScan(items, items, scan_op, block_aggregate);
-      }
-
-
-      // Inclusive sum specialization
-      //
-      void THRUST_DEVICE_FUNCTION scan_tile(T (&items)[ITEMS_PER_THREAD],
-                                            plus<T> /*scan_op*/,
-                                            T &     block_aggregate,
-                                            thrust::detail::true_type /* is_inclusive */)
-      {
-        BlockScan(storage.scan).InclusiveSum(items, items, block_aggregate);
-      }
-
-      //---------------------------------------------------------------------
-      // Block scan utility methods (subsequent tiles)
-      //---------------------------------------------------------------------
-
-      // Exclusive scan specialization (with prefix from predecessors)
-      //
-      template <class _ScanOp, class PrefixCallback>
-      void THRUST_DEVICE_FUNCTION scan_tile(T (&items)[ITEMS_PER_THREAD],
-                                            _ScanOp         scan_op,
-                                            T &             block_aggregate,
-                                            PrefixCallback &prefix_op,
-                                            thrust::detail::false_type /* is_inclusive */)
-      {
-        BlockScan(storage.scan).ExclusiveScan(items, items, scan_op, prefix_op);
-        block_aggregate = prefix_op.GetBlockAggregate();
-      }
-  
-      // Exclusive sum specialization (with prefix from predecessors)
-      //
-      template <class PrefixCallback>
-      THRUST_DEVICE_FUNCTION void scan_tile(T (&items)[ITEMS_PER_THREAD],
-                                            plus<T>         /*scan_op*/,
-                                            T &             block_aggregate,
-                                            PrefixCallback &prefix_op,
-                                            thrust::detail::false_type /* is_inclusive */)
-      {
-        BlockScan(storage.scan).ExclusiveSum(items, items, prefix_op);
-        block_aggregate = prefix_op.GetBlockAggregate();
-      }
-
-      // Inclusive scan specialization (with prefix from predecessors)
-      //
-      template <class _ScanOp, class PrefixCallback>
-      THRUST_DEVICE_FUNCTION void scan_tile(T (&items)[ITEMS_PER_THREAD],
-                                            _ScanOp         scan_op,
-                                            T &             block_aggregate,
-                                            PrefixCallback &prefix_op,
-                                            thrust::detail::true_type /* is_inclusive */)
-      {
-        BlockScan(storage.scan).InclusiveScan(items, items, scan_op, prefix_op);
-        block_aggregate = prefix_op.GetBlockAggregate();
-      }
-
-      // Inclusive sum specialization (with prefix from predecessors)
-      //
-      template <class U, class PrefixCallback>
-      THRUST_DEVICE_FUNCTION void scan_tile(T (&items)[ITEMS_PER_THREAD],
-                                            plus<T>         /*scan_op*/,
-                                            T &             block_aggregate,
-                                            PrefixCallback &prefix_op,
-                                            thrust::detail::true_type /* is_inclusive */)
-      {
-        BlockScan(storage.scan).InclusiveSum(items, items, prefix_op);
-        block_aggregate = prefix_op.GetBlockAggregate();
-      }
-
-      //---------------------------------------------------------------------
-      // Cooperatively scan a device-wide sequence of tiles with other CTAs
-      //---------------------------------------------------------------------
-
-      // Process a tile of input (dynamic chained scan)
-      //
-      template <bool IS_FULL_TILE, class AddInitToExclusive>
-      THRUST_DEVICE_FUNCTION void
-      consume_tile(Size               /*num_items*/,
-                   Size               num_remaining,
-                   int                tile_idx,
-                   Size               tile_base,
-                   AddInitToExclusive add_init_to_exclusive_scan)
-      {
-        using core::sync_threadblock;
-
-        // Load items
-        T items[ITEMS_PER_THREAD];
-
-        if (IS_FULL_TILE)
-        {
-          BlockLoad(storage.load).Load(load_it + tile_base, items);
-        }
-        else
-        {
-          // Fill last element with the first element
-          // because collectives are not suffix guarded
-          BlockLoad(storage.load)
-              .Load(load_it + tile_base,
-                    items,
-                    num_remaining,
-                    *(load_it + tile_base));
-        }
-
-        if (SYNC_AFTER_LOAD)
-          sync_threadblock();
-
-        // Perform tile scan
-        if (tile_idx == 0)
-        {
-          // Scan first tile
-          T block_aggregate;
-          scan_tile(items, scan_op, block_aggregate, Inclusive());
-
-          // Update tile status if there may be successor tiles (i.e., this tile is full)
-          if (IS_FULL_TILE && (threadIdx.x == 0))
-            tile_state.SetInclusive(0, block_aggregate);
-        }
-        else
-        {
-          // Scan non-first tile
-          T                  block_aggregate;
-          TilePrefixCallback prefix_op(tile_state, storage.prefix, scan_op, tile_idx);
-          scan_tile(items, scan_op, block_aggregate, prefix_op, Inclusive());
-        }
-
-        sync_threadblock();
-
-        add_init_to_exclusive_scan(items, tile_idx);
-
-        // Store items
-        if (IS_FULL_TILE)
-        {
-          BlockStore(storage.store).Store(output_it + tile_base, items);
-        }
-        else
-        {
-          BlockStore(storage.store).Store(output_it + tile_base, items, num_remaining);
-        }
-      }
-      
-
-      //---------------------------------------------------------------------
-      // Constructor
-      //---------------------------------------------------------------------
-      
-      // Dequeue and scan tiles of items as part of a dynamic chained scan
-      // with Init
-      template <class AddInitToExclusiveScan>
-      THRUST_DEVICE_FUNCTION
-      impl(TempStorage &          storage_,
-           ScanTileState &        tile_state_,
-           InputIt                input_it,
-           OutputIt               output_it_,
-           ScanOp                 scan_op_,
-           Size                   num_items,
-           AddInitToExclusiveScan add_init_to_exclusive_scan)
-          : storage(storage_),
-            tile_state(tile_state_),
-            load_it(core::make_load_iterator(ptx_plan(), input_it)),
-            output_it(output_it_),
-            scan_op(scan_op_)
-      {
-        int  tile_idx      = blockIdx.x;
-        Size tile_base     = ITEMS_PER_TILE * tile_idx;
-        Size num_remaining = num_items - tile_base;
-
-        if (num_remaining > ITEMS_PER_TILE)
-        {
-          // Full tile
-          consume_tile<true>(num_items,
-                             num_remaining,
-                             tile_idx,
-                             tile_base,
-                             add_init_to_exclusive_scan);
-        }
-        else if (num_remaining > 0)
-        {
-          // Partially-full tile
-          consume_tile<false>(num_items,
-                              num_remaining,
-                              tile_idx,
-                              tile_base,
-                              add_init_to_exclusive_scan);
-        }
-      }
-    };    // struct impl
-
-    //---------------------------------------------------------------------
-    // Agent entry point
-    //---------------------------------------------------------------------
-
-    template <class AddInitToExclusiveScan>
-    THRUST_AGENT_ENTRY(InputIt                input_it,
-                       OutputIt               output_it,
-                       ScanOp                 scan_op,
-                       Size                   num_items,
-                       ScanTileState          tile_state,
-                       AddInitToExclusiveScan add_init_to_exclusive_scan,
-                       char *                 shmem)
-    {
-      TempStorage &storage = *reinterpret_cast<TempStorage *>(shmem);
-      impl(storage,
-           tile_state,
-           input_it,
-           output_it,
-           scan_op,
-           num_items,
-           add_init_to_exclusive_scan);
-    }
-  };    // struct ScanAgent
-
-  template <class ScanTileState,
-            class Size>
-  struct InitAgent
-  {
-    template <class Arch>
-    struct PtxPlan : PtxPolicy<128> {};
-   
-    typedef core::specialize_plan<PtxPlan> ptx_plan;
-
-    //---------------------------------------------------------------------
-    // Agent entry point
-    //---------------------------------------------------------------------
-
-    THRUST_AGENT_ENTRY(ScanTileState tile_state,
-                       Size          num_tiles,
-                       char *        /*shmem*/)
-    {
-      tile_state.InitializeStatus(num_tiles);
-    }
+#include <cub/device/device_scan.cuh>
 
-  }; // struct InitAgent
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub
+{
+namespace detail
+{
 
-  template<class T>
-  struct DoNothing
+__thrust_exec_check_disable__
+template <typename Derived,
+          typename InputIt,
+          typename Size,
+          typename OutputIt,
+          typename ScanOp>
+__host__ __device__
+OutputIt inclusive_scan_n_impl(thrust::cuda_cub::execution_policy<Derived> &policy,
+                               InputIt first,
+                               Size num_items,
+                               OutputIt result,
+                               ScanOp scan_op)
+{
+  using AccumT = typename thrust::iterator_traits<InputIt>::value_type;
+  using Dispatch32 = cub::DispatchScan<InputIt,
+                                       OutputIt,
+                                       ScanOp,
+                                       cub::NullType,
+                                       thrust::detail::int32_t,
+                                       AccumT>;
+  using Dispatch64 = cub::DispatchScan<InputIt,
+                                       OutputIt,
+                                       ScanOp,
+                                       cub::NullType,
+                                       thrust::detail::int64_t,
+                                       AccumT>;
+
+  cudaStream_t stream = thrust::cuda_cub::stream(policy);
+  cudaError_t status;
+
+  // Determine temporary storage requirements:
+  size_t tmp_size = 0;
   {
-    typedef T     type;
-    template <int ITEMS_PER_THREAD>
-    THRUST_DEVICE_FUNCTION void
-    operator()(T (&items)[ITEMS_PER_THREAD], int /*tile_idx*/)
-    {
-      THRUST_UNUSED_VAR(items);
-    }
-  };    // struct DoNothing
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                num_items,
+                                (nullptr,
+                                 tmp_size,
+                                 first,
+                                 result,
+                                 scan_op,
+                                 cub::NullType{},
+                                 num_items_fixed,
+                                 stream));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after determining tmp storage "
+                                     "requirements for inclusive_scan");
+  }
 
-  template<class T, class ScanOp>
-  struct AddInitToExclusiveScan
+  // Run scan:
   {
-    typedef T type;
-    T         init;
-    ScanOp    scan_op;
-
-    THRUST_RUNTIME_FUNCTION
-    AddInitToExclusiveScan(T init_, ScanOp scan_op_)
-        : init(init_), scan_op(scan_op_) {}
+    // Allocate temporary storage:
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived> tmp{
+      policy,
+      tmp_size};
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                num_items,
+                                (tmp.data().get(),
+                                 tmp_size,
+                                 first,
+                                 result,
+                                 scan_op,
+                                 cub::NullType{},
+                                 num_items_fixed,
+                                 stream));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after dispatching inclusive_scan kernel");
+    thrust::cuda_cub::throw_on_error(thrust::cuda_cub::synchronize_optional(policy),
+                                     "inclusive_scan failed to synchronize");
+  }
 
-    template <int ITEMS_PER_THREAD>
-    THRUST_DEVICE_FUNCTION void
-    operator()(T (&items)[ITEMS_PER_THREAD], int tile_idx)
-    {
-      if (tile_idx == 0 && threadIdx.x == 0)
-      {
-        items[0] = init;
-        for (int i = 1; i < ITEMS_PER_THREAD; ++i)
-          items[i] = scan_op(init, items[i]);
-      }
-      else
-      {
-        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
-          items[i] = scan_op(init, items[i]);
-      }
-    }
-  };    // struct AddInitToExclusiveScan
+  return result + num_items;
+}
 
-  template <class Inclusive,
-            class InputIt,
-            class OutputIt,
-            class ScanOp,
-            class Size,
-            class AddInitToExclusiveScan>
-  static cudaError_t THRUST_RUNTIME_FUNCTION
-  doit_step(void *                 d_temp_storage,
-            size_t &               temp_storage_bytes,
-            InputIt                input_it,
-            Size                   num_items,
-            AddInitToExclusiveScan add_init_to_exclusive_scan,
-            OutputIt               output_it,
-            ScanOp                 scan_op,
-            cudaStream_t           stream,
-            bool                   debug_sync)
+__thrust_exec_check_disable__
+template <typename Derived,
+          typename InputIt,
+          typename Size,
+          typename OutputIt,
+          typename InitValueT,
+          typename ScanOp>
+__host__ __device__
+OutputIt exclusive_scan_n_impl(thrust::cuda_cub::execution_policy<Derived> &policy,
+                               InputIt first,
+                               Size num_items,
+                               OutputIt result,
+                               InitValueT init,
+                               ScanOp scan_op)
+{
+  using InputValueT = cub::detail::InputValue<InitValueT>;
+  using Dispatch32 = cub::DispatchScan<InputIt,
+                                       OutputIt,
+                                       ScanOp,
+                                       InputValueT,
+                                       thrust::detail::int32_t,
+                                       InitValueT>;
+  using Dispatch64 = cub::DispatchScan<InputIt,
+                                       OutputIt,
+                                       ScanOp,
+                                       InputValueT,
+                                       thrust::detail::int64_t,
+                                       InitValueT>;
+
+  cudaStream_t stream = thrust::cuda_cub::stream(policy);
+  cudaError_t status;
+
+  // Determine temporary storage requirements:
+  size_t tmp_size = 0;
   {
-    using core::AgentPlan;
-    using core::AgentLauncher;
-
-    cudaError_t status = cudaSuccess;
-    if (num_items == 0)
-      return cudaErrorNotSupported;
-
-    typedef typename AddInitToExclusiveScan::type T;
-
-    typedef AgentLauncher<
-        ScanAgent<InputIt, OutputIt, ScanOp, Size, T, Inclusive> >
-        scan_agent;
-
-    typedef typename scan_agent::ScanTileState ScanTileState;
-
-    typedef AgentLauncher<InitAgent<ScanTileState, Size> > init_agent;
-
-    AgentPlan scan_plan = scan_agent::get_plan(stream);
-    AgentPlan init_plan = init_agent::get_plan();
-
-    int tile_size = scan_plan.items_per_tile;
-    Size num_tiles = static_cast<Size>((num_items + tile_size - 1) / tile_size);
-
-    size_t vshmem_size = core::vshmem_size(scan_plan.shared_memory_size,
-                                           num_tiles);
-
-    size_t allocation_sizes[2] = {0, vshmem_size};
-    status = ScanTileState::AllocationSize(static_cast<int>(num_tiles), allocation_sizes[0]);
-    CUDA_CUB_RET_IF_FAIL(status);
-
-    void* allocations[2] = {NULL, NULL};
-
-    status = core::alias_storage(d_temp_storage,
-                                 temp_storage_bytes,
-                                 allocations,
-                                 allocation_sizes);
-    CUDA_CUB_RET_IF_FAIL(status);
-
-    if (d_temp_storage == NULL)
-    {
-      return status;
-    }
-    
-    ScanTileState tile_state;
-    status = tile_state.Init(static_cast<int>(num_tiles), allocations[0], allocation_sizes[0]);
-    CUDA_CUB_RET_IF_FAIL(status);
-
-    char *vshmem_ptr = vshmem_size > 0 ? (char*)allocations[1] : NULL;
-    
-    init_agent ia(init_plan, num_tiles, stream, "scan::init_agent", debug_sync);
-    ia.launch(tile_state, num_tiles);
-    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
-
-    scan_agent sa(scan_plan, num_items, stream, vshmem_ptr, "scan::scan_agent", debug_sync);
-    sa.launch(input_it,
-              output_it,
-              scan_op,
-              num_items,
-              tile_state,
-              add_init_to_exclusive_scan);
-    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
-    return status;
-  }    // func doit_step
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                num_items,
+                                (nullptr,
+                                 tmp_size,
+                                 first,
+                                 result,
+                                 scan_op,
+                                 InputValueT(init),
+                                 num_items_fixed,
+                                 stream));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after determining tmp storage "
+                                     "requirements for exclusive_scan");
+  }
 
-  template <typename Inclusive,
-            typename Derived,
-            typename InputIt,
-            typename OutputIt,
-            typename Size,
-            typename ScanOp,
-            typename AddInitToExclusiveScan>
-  THRUST_RUNTIME_FUNCTION
-  OutputIt scan(execution_policy<Derived>& policy,
-                InputIt                    input_it,
-                OutputIt                   output_it,
-                Size                       num_items,
-                ScanOp                     scan_op,
-                AddInitToExclusiveScan     add_init_to_exclusive_scan)
+  // Run scan:
   {
-    if (num_items == 0)
-      return output_it;
-
-    size_t       storage_size = 0;
-    cudaStream_t stream       = cuda_cub::stream(policy);
-    bool         debug_sync   = THRUST_DEBUG_SYNC_FLAG;
-
-    cudaError_t status;
-    status = doit_step<Inclusive>(NULL,
-                                  storage_size,
-                                  input_it,
-                                  num_items,
-                                  add_init_to_exclusive_scan,
-                                  output_it,
-                                  scan_op,
-                                  stream,
-                                  debug_sync);
-    cuda_cub::throw_on_error(status, "scan failed on 1st step");
-
-    // Allocate temporary storage.
-    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
-      tmp(policy, storage_size);
-    void *ptr = static_cast<void*>(tmp.data().get());
-
-    status = doit_step<Inclusive>(ptr,
-                                  storage_size,
-                                  input_it,
-                                  num_items,
-                                  add_init_to_exclusive_scan,
-                                  output_it,
-                                  scan_op,
-                                  stream,
-                                  debug_sync);
-    cuda_cub::throw_on_error(status, "scan failed on 2nd step");
-
-    status = cuda_cub::synchronize(policy);
-    cuda_cub::throw_on_error(status, "scan failed to synchronize");
+    // Allocate temporary storage:
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived> tmp{
+      policy,
+      tmp_size};
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                num_items,
+                                (tmp.data().get(),
+                                 tmp_size,
+                                 first,
+                                 result,
+                                 scan_op,
+                                 InputValueT(init),
+                                 num_items_fixed,
+                                 stream));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after dispatching exclusive_scan kernel");
+    thrust::cuda_cub::throw_on_error(thrust::cuda_cub::synchronize_optional(policy),
+                                     "exclusive_scan failed to synchronize");
+  }
 
-    return output_it + num_items;
-  }    // func scan
+  return result + num_items;
+}
 
-}    // namespace __scan
+} // namespace detail
 
 //-------------------------
 // Thrust API entry points
 //-------------------------
 
 __thrust_exec_check_disable__
-template <class Derived,
-          class InputIt,
-          class Size,
-          class OutputIt,
-          class ScanOp>
-OutputIt __host__ __device__
-inclusive_scan_n(execution_policy<Derived> &policy,
-                 InputIt                    first,
-                 Size                       num_items,
-                 OutputIt                   result,
-                 ScanOp                     scan_op)
+template <typename Derived,
+          typename InputIt,
+          typename Size,
+          typename OutputIt,
+          typename ScanOp>
+__host__ __device__
+OutputIt inclusive_scan_n(thrust::cuda_cub::execution_policy<Derived> &policy,
+                          InputIt first,
+                          Size num_items,
+                          OutputIt result,
+                          ScanOp scan_op)
 {
-  OutputIt ret = result;
-  if (__THRUST_HAS_CUDART__)
-  {
-    typedef typename iterator_traits<InputIt>::value_type T;
-    ret = __scan::scan<thrust::detail::true_type>(policy,
-                                                  first,
-                                                  result,
-                                                  num_items,
-                                                  scan_op,
-                                                  __scan::DoNothing<T>());
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::inclusive_scan(cvt_to_seq(derived_cast(policy)),
-                                 first,
-                                 first + num_items,
-                                 result,
-                                 scan_op);
-#endif
-  }
-  return ret;
+  THRUST_CDP_DISPATCH(
+    (result = thrust::cuda_cub::detail::inclusive_scan_n_impl(policy,
+                                                              first,
+                                                              num_items,
+                                                              result,
+                                                              scan_op);),
+    (result = thrust::inclusive_scan(cvt_to_seq(derived_cast(policy)),
+                                     first,
+                                     first + num_items,
+                                     result,
+                                     scan_op);));
+  return result;
 }
 
-
-template <class Derived,
-          class InputIt,
-          class OutputIt,
-          class ScanOp>
-OutputIt __host__ __device__
-inclusive_scan(execution_policy<Derived> &policy,
-               InputIt                    first,
-               InputIt                    last,
-               OutputIt                   result,
-               ScanOp                     scan_op)
+template <typename Derived, typename InputIt, typename OutputIt, typename ScanOp>
+__host__ __device__
+OutputIt inclusive_scan(thrust::cuda_cub::execution_policy<Derived> &policy,
+                        InputIt first,
+                        InputIt last,
+                        OutputIt result,
+                        ScanOp scan_op)
 {
-  int num_items = static_cast<int>(thrust::distance(first, last));
-  return cuda_cub::inclusive_scan_n(policy, first, num_items, result, scan_op);
+  using diff_t = typename thrust::iterator_traits<InputIt>::difference_type;
+  diff_t const num_items = thrust::distance(first, last);
+  return thrust::cuda_cub::inclusive_scan_n(policy,
+                                            first,
+                                            num_items,
+                                            result,
+                                            scan_op);
 }
 
-
-template <class Derived,
-          class InputIt,
-          class OutputIt>
-OutputIt __host__ __device__
-inclusive_scan(execution_policy<Derived> &policy,
-               InputIt                    first,
-               OutputIt                   last,
-               OutputIt                   result)
+template <typename Derived, typename InputIt, typename OutputIt>
+__host__ __device__
+OutputIt inclusive_scan(thrust::cuda_cub::execution_policy<Derived> &policy,
+                        InputIt first,
+                        InputIt last,
+                        OutputIt result)
 {
-
-  typedef typename thrust::detail::eval_if<
-      thrust::detail::is_output_iterator<OutputIt>::value,
-      thrust::iterator_value<InputIt>,
-      thrust::iterator_value<OutputIt> >::type result_type;
-  return cuda_cub::inclusive_scan(policy, first, last, result, plus<result_type>());
-};
+  return thrust::cuda_cub::inclusive_scan(policy,
+                                          first,
+                                          last,
+                                          result,
+                                          thrust::plus<>{});
+}
 
 __thrust_exec_check_disable__
-template <class Derived,
-          class InputIt,
-          class Size,
-          class OutputIt,
-          class T,
-          class ScanOp>
-OutputIt __host__ __device__
-exclusive_scan_n(execution_policy<Derived> &policy,
-                 InputIt                    first,
-                 Size                       num_items,
-                 OutputIt                   result,
-                 T                          init,
-                 ScanOp                     scan_op)
+template <typename Derived,
+          typename InputIt,
+          typename Size,
+          typename OutputIt,
+          typename T,
+          typename ScanOp>
+__host__ __device__
+OutputIt exclusive_scan_n(thrust::cuda_cub::execution_policy<Derived> &policy,
+                          InputIt first,
+                          Size num_items,
+                          OutputIt result,
+                          T init,
+                          ScanOp scan_op)
 {
-  OutputIt ret = result;
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __scan::scan<thrust::detail::false_type>(
-        policy,
-        first,
-        result,
-        num_items,
-        scan_op,
-        __scan::AddInitToExclusiveScan<T, ScanOp>(init, scan_op));
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::exclusive_scan(cvt_to_seq(derived_cast(policy)),
-                                 first,
-                                 first + num_items,
-                                 result,
-                                 init,
-                                 scan_op);
-#endif
-  }
-  return ret;
+  THRUST_CDP_DISPATCH(
+    (result = thrust::cuda_cub::detail::exclusive_scan_n_impl(policy,
+                                                              first,
+                                                              num_items,
+                                                              result,
+                                                              init,
+                                                              scan_op);),
+    (result = thrust::exclusive_scan(cvt_to_seq(derived_cast(policy)),
+                                     first,
+                                     first + num_items,
+                                     result,
+                                     init,
+                                     scan_op);));
+  return result;
 }
 
-template <class Derived,
-          class InputIt,
-          class OutputIt,
-          class T,
-          class ScanOp>
-OutputIt __host__ __device__
-exclusive_scan(execution_policy<Derived> &policy,
-               InputIt                    first,
-               InputIt                    last,
-               OutputIt                   result,
-               T                          init,
-               ScanOp                   scan_op)
+template <typename Derived,
+          typename InputIt,
+          typename OutputIt,
+          typename T,
+          typename ScanOp>
+__host__ __device__
+OutputIt exclusive_scan(thrust::cuda_cub::execution_policy<Derived> &policy,
+                        InputIt first,
+                        InputIt last,
+                        OutputIt result,
+                        T init,
+                        ScanOp scan_op)
 {
-  int num_items = static_cast<int>(thrust::distance(first, last));
-  return cuda_cub::exclusive_scan_n(policy, first, num_items, result, init, scan_op);
+  using diff_t = typename thrust::iterator_traits<InputIt>::difference_type;
+  diff_t const num_items = thrust::distance(first, last);
+  return thrust::cuda_cub::exclusive_scan_n(policy,
+                                            first,
+                                            num_items,
+                                            result,
+                                            init,
+                                            scan_op);
 }
 
-template <class Derived,
-          class InputIt,
-          class OutputIt,
-          class T>
-OutputIt __host__ __device__
-exclusive_scan(execution_policy<Derived> &policy,
-               InputIt                    first,
-               OutputIt                   last,
-               OutputIt                   result,
-               T                          init)
+template <typename Derived, typename InputIt, typename OutputIt, typename T>
+__host__ __device__
+OutputIt exclusive_scan(thrust::cuda_cub::execution_policy<Derived> &policy,
+                        InputIt first,
+                        InputIt last,
+                        OutputIt result,
+                        T init)
 {
-  return cuda_cub::exclusive_scan(policy, first, last, result, init, plus<T>());
+  return thrust::cuda_cub::exclusive_scan(policy,
+                                          first,
+                                          last,
+                                          result,
+                                          init,
+                                          thrust::plus<>{});
 }
 
-template <class Derived,
-          class InputIt,
-          class OutputIt>
-OutputIt __host__ __device__
-exclusive_scan(execution_policy<Derived> &policy,
-               InputIt                    first,
-               OutputIt                   last,
-               OutputIt                   result)
+template <typename Derived, typename InputIt, typename OutputIt>
+__host__ __device__
+OutputIt exclusive_scan(thrust::cuda_cub::execution_policy<Derived> &policy,
+                        InputIt first,
+                        InputIt last,
+                        OutputIt result)
 {
-  typedef typename thrust::detail::eval_if<
-      thrust::detail::is_output_iterator<OutputIt>::value,
-      thrust::iterator_value<InputIt>,
-      thrust::iterator_value<OutputIt>
-  >::type result_type;
-  return cuda_cub::exclusive_scan(policy, first, last, result, result_type(0));
+  using init_type = typename thrust::iterator_traits<InputIt>::value_type;
+  return cuda_cub::exclusive_scan(policy, first, last, result, init_type{});
 };
 
 } // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #include <thrust/scan.h>
 
-#endif
+#endif // NVCC
diff --git a/thrust/system/cuda/detail/scan_by_key.h b/thrust/system/cuda/detail/scan_by_key.h
index b88445110..0407779c6 100644
--- a/thrust/system/cuda/detail/scan_by_key.h
+++ b/thrust/system/cuda/detail/scan_by_key.h
@@ -26,759 +26,263 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/distance.h>
+#include <thrust/functional.h>
+
+#include <thrust/type_traits/is_contiguous_iterator.h>
+
+#include <thrust/iterator/iterator_traits.h>
+
 #include <thrust/detail/cstdint.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/detail/mpl/math.h>
 #include <thrust/detail/temporary_array.h>
-#include <thrust/system/cuda/detail/util.h>
 
-#include <thrust/system/cuda/execution_policy.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/dispatch.h>
 #include <thrust/system/cuda/detail/par_to_seq.h>
-#include <thrust/system/cuda/detail/core/agent_launcher.h>
-#include <thrust/detail/mpl/math.h>
-#include <thrust/detail/minmax.h>
-#include <thrust/distance.h>
+#include <thrust/system/cuda/detail/util.h>
 
-THRUST_BEGIN_NS
-namespace cuda_cub {
+#include <cub/device/dispatch/dispatch_scan_by_key.cuh>
+#include <cub/util_type.cuh>
 
-namespace __scan_by_key {
-  namespace mpl = thrust::detail::mpl::math;
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub
+{
+namespace detail
+{
 
-  template <int                      _BLOCK_THREADS,
-            int                      _ITEMS_PER_THREAD = 1,
-            cub::BlockLoadAlgorithm  _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
-            cub::CacheLoadModifier   _LOAD_MODIFIER    = cub::LOAD_DEFAULT,
-            cub::BlockScanAlgorithm  _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS,
-            cub::BlockStoreAlgorithm _STORE_ALGORITHM  = cub::BLOCK_STORE_DIRECT,
-            int                      _MIN_BLOCKS       = 1>
-  struct PtxPolicy
-  {
-    enum
-    {
-      BLOCK_THREADS    = _BLOCK_THREADS,
-      ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
-      ITEMS_PER_TILE   = BLOCK_THREADS * ITEMS_PER_THREAD,
-      MIN_BLOCKS       = _MIN_BLOCKS
-    };
-
-    static const cub::BlockLoadAlgorithm  LOAD_ALGORITHM  = _LOAD_ALGORITHM;
-    static const cub::CacheLoadModifier   LOAD_MODIFIER   = _LOAD_MODIFIER;
-    static const cub::BlockScanAlgorithm  SCAN_ALGORITHM  = _SCAN_ALGORITHM;
-    static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
-  };    // struct PtxPolicy
-
-  template <class Arch, class Key, class Value>
-  struct Tuning;
-  
-  template <class Key, class Value>
-  struct Tuning<sm30, Key, Value>
-  {
-    enum
-    {
-      MAX_INPUT_BYTES      = mpl::max<size_t, sizeof(Key), sizeof(Value)>::value,
-      COMBINED_INPUT_BYTES = sizeof(Key) + sizeof(Value),
-
-      NOMINAL_4B_ITEMS_PER_THREAD = 6,
-
-      ITEMS_PER_THREAD = mpl::min<
-          int,
-          NOMINAL_4B_ITEMS_PER_THREAD,
-          mpl::max<
-              int,
-              1,
-              ((NOMINAL_4B_ITEMS_PER_THREAD * 8) +
-               COMBINED_INPUT_BYTES - 1) /
-                  COMBINED_INPUT_BYTES>::value>::value,
-    };
-
-    typedef PtxPolicy<128,
-                      ITEMS_PER_THREAD,
-                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
-                      cub::LOAD_DEFAULT,
-                      cub::BLOCK_SCAN_WARP_SCANS,
-                      cub::BLOCK_STORE_WARP_TRANSPOSE>
-        type;
-  };    // Tuning sm30
-
-  template <class Key, class Value>
-  struct Tuning<sm35, Key, Value> : Tuning<sm30, Key, Value>
-  {
-    enum
-    {
-      NOMINAL_4B_ITEMS_PER_THREAD = 6,
-
-      ITEMS_PER_THREAD =
-          (Tuning::MAX_INPUT_BYTES <= 8)
-              ? 6
-              : mpl::min<
-                    int,
-                    NOMINAL_4B_ITEMS_PER_THREAD,
-                    mpl::max<
-                        int,
-                        1,
-                        ((NOMINAL_4B_ITEMS_PER_THREAD * 8) +
-                         Tuning::COMBINED_INPUT_BYTES - 1) /
-                            Tuning::COMBINED_INPUT_BYTES>::value>::value,
-    };
-
-    typedef PtxPolicy<128,
-                      ITEMS_PER_THREAD,
-                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
-                      cub::LOAD_LDG,
-                      cub::BLOCK_SCAN_WARP_SCANS,
-                      cub::BLOCK_STORE_WARP_TRANSPOSE>
-        type;
-  };    // Tuning sm35
-
-  template <class Key, class Value>
-  struct Tuning<sm52, Key, Value> : Tuning<sm30, Key, Value>
-  {
-    enum
-    {
-      NOMINAL_4B_ITEMS_PER_THREAD = 9,
-
-      ITEMS_PER_THREAD =
-          (Tuning::MAX_INPUT_BYTES <= 8)
-              ? 9
-              : mpl::min<
-                    int,
-                    NOMINAL_4B_ITEMS_PER_THREAD,
-                    mpl::max<
-                        int,
-                        1,
-                        ((NOMINAL_4B_ITEMS_PER_THREAD * 8) +
-                         Tuning::COMBINED_INPUT_BYTES - 1) /
-                            Tuning::COMBINED_INPUT_BYTES>::value>::value,
-    };
-
-    typedef PtxPolicy<256,
-                      ITEMS_PER_THREAD,
-                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
-                      cub::LOAD_LDG,
-                      cub::BLOCK_SCAN_WARP_SCANS,
-                      cub::BLOCK_STORE_WARP_TRANSPOSE>
-        type;
-  };    // Tuning sm52
-
-  template <class KeysInputIt,
-            class ValuesInputIt,
-            class ValuesOutputIt,
-            class EqualityOp,
-            class ScanOp,
-            class Size,
-            class T,
-            class Inclusive>
-  struct ScanByKeyAgent
+__thrust_exec_check_disable__
+template <typename Derived,
+          typename KeysInIt,
+          typename ValuesInIt,
+          typename ValuesOutIt,
+          typename EqualityOpT,
+          typename ScanOpT,
+          typename SizeT>
+__host__ __device__
+ValuesOutIt inclusive_scan_by_key_n(
+  thrust::cuda_cub::execution_policy<Derived>& policy,
+  KeysInIt keys,
+  ValuesInIt values,
+  ValuesOutIt result,
+  SizeT num_items,
+  EqualityOpT equality_op,
+  ScanOpT scan_op)
+{
+  if (num_items == 0)
   {
-    typedef typename iterator_traits<KeysInputIt>::value_type key_type;
-
-    typedef T    value_type;
-    typedef Size size_type;
-
-    typedef cub::KeyValuePair<size_type, value_type> size_value_pair_t;
-    typedef cub::KeyValuePair<key_type, value_type> key_value_pair_t;
-
-    typedef cub::ReduceByKeyScanTileState<value_type, size_type> ScanTileState;
-    typedef cub::ReduceBySegmentOp<ScanOp> ReduceBySegmentOp;
-
-    template <class Arch>
-    struct PtxPlan : Tuning<Arch, key_type, value_type>::type
-    {
-      typedef Tuning<Arch, key_type, value_type> tuning;
-
-      typedef typename core::LoadIterator<PtxPlan, KeysInputIt>::type   KeysLoadIt;
-      typedef typename core::LoadIterator<PtxPlan, ValuesInputIt>::type ValuesLoadIt;
-
-      typedef typename core::BlockLoad<PtxPlan, KeysLoadIt, key_type>::type     BlockLoadKeys;
-      typedef typename core::BlockLoad<PtxPlan, ValuesLoadIt, value_type>::type BlockLoadValues;
-
-      typedef typename core::BlockStore<PtxPlan,
-                                        ValuesOutputIt,
-                                        value_type>::type BlockStoreValues;
-
-      typedef cub::BlockDiscontinuity<key_type,
-                                      PtxPlan::BLOCK_THREADS,
-                                      1,
-                                      1,
-                                      Arch::ver>
-          BlockDiscontinuityKeys;
-
-      typedef cub::TilePrefixCallbackOp<size_value_pair_t,
-                                        ReduceBySegmentOp,
-                                        ScanTileState,
-                                        Arch::ver>
-          TilePrefixCallback;
-      typedef cub::BlockScan<size_value_pair_t,
-                             PtxPlan::BLOCK_THREADS,
-                             PtxPlan::SCAN_ALGORITHM,
-                             1,
-                             1,
-                             Arch::ver>
-          BlockScan;
-
-      union TempStorage
-      {
-        struct
-        {
-          typename BlockScan::TempStorage              scan;
-          typename TilePrefixCallback::TempStorage     prefix;
-          typename BlockDiscontinuityKeys::TempStorage discontinuity;
-        };
-
-        typename BlockLoadKeys::TempStorage   load_keys;
-        typename BlockLoadValues::TempStorage load_values;
-
-        typename BlockStoreValues::TempStorage store_values;
-      };    // union TempStorage
-    };      // struct PtxPlan
-    
-    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
-
-    typedef typename ptx_plan::KeysLoadIt   KeysLoadIt;
-    typedef typename ptx_plan::ValuesLoadIt ValuesLoadIt;
-
-    typedef typename ptx_plan::BlockLoadKeys    BlockLoadKeys;
-    typedef typename ptx_plan::BlockLoadValues  BlockLoadValues;
-    typedef typename ptx_plan::BlockStoreValues BlockStoreValues;
-
-    typedef typename ptx_plan::BlockDiscontinuityKeys BlockDiscontinuityKeys;
-    typedef typename ptx_plan::TilePrefixCallback     TilePrefixCallback;
-    typedef typename ptx_plan::BlockScan              BlockScan;
-    typedef typename ptx_plan::TempStorage            TempStorage;
-
-    enum
-    {
-      BLOCK_THREADS     = ptx_plan::BLOCK_THREADS,
-      ITEMS_PER_THREAD  = ptx_plan::ITEMS_PER_THREAD,
-      ITEMS_PER_TILE    = ptx_plan::ITEMS_PER_TILE,
-    };
-    
-    struct impl
-    {
-      //---------------------------------------------------------------------
-      // Per thread data
-      //---------------------------------------------------------------------
-
-      TempStorage &  storage;
-      ScanTileState &tile_state;
-
-      KeysLoadIt     keys_load_it;
-      ValuesLoadIt   values_load_it;
-      ValuesOutputIt values_output_it;
-
-      cub::InequalityWrapper<EqualityOp> inequality_op;
-      ReduceBySegmentOp                  scan_op;
-
-
-      //---------------------------------------------------------------------
-      // Block scan utility methods (first tile)
-      //---------------------------------------------------------------------
-
-      // Exclusive scan specialization
-      //
-      THRUST_DEVICE_FUNCTION void
-      scan_tile(size_value_pair_t (&scan_items)[ITEMS_PER_THREAD],
-                size_value_pair_t &tile_aggregate,
-                thrust::detail::false_type /* is_inclusive */)
-      {
-        BlockScan(storage.scan)
-            .ExclusiveScan(scan_items, scan_items, scan_op, tile_aggregate);
-      }
-      
-      // Inclusive scan specialization
-      //
-      THRUST_DEVICE_FUNCTION void
-      scan_tile(size_value_pair_t (&scan_items)[ITEMS_PER_THREAD],
-                size_value_pair_t &tile_aggregate,
-                thrust::detail::true_type /* is_inclusive */)
-      {
-        BlockScan(storage.scan)
-            .InclusiveScan(scan_items, scan_items, scan_op, tile_aggregate);
-      }
-      
-      //---------------------------------------------------------------------
-      // Block scan utility methods (subsequent tiles)
-      //---------------------------------------------------------------------
-      
-      // Exclusive scan specialization (with prefix from predecessors)
-      //
-      THRUST_DEVICE_FUNCTION void
-      scan_tile(size_value_pair_t (&scan_items)[ITEMS_PER_THREAD],
-                size_value_pair_t & tile_aggregate,
-                TilePrefixCallback &prefix_op,
-                thrust::detail::false_type /* is_incclusive */)
-      {
-        BlockScan(storage.scan)
-            .ExclusiveScan(scan_items, scan_items, scan_op, prefix_op);
-        tile_aggregate = prefix_op.GetBlockAggregate();
-      }
-      
-      // Inclusive scan specialization (with prefix from predecessors)
-      //
-      THRUST_DEVICE_FUNCTION void
-      scan_tile(size_value_pair_t (&scan_items)[ITEMS_PER_THREAD],
-                size_value_pair_t & tile_aggregate,
-                TilePrefixCallback &prefix_op,
-                thrust::detail::true_type /* is_inclusive */)
-      {
-        BlockScan(storage.scan)
-            .InclusiveScan(scan_items, scan_items, scan_op, prefix_op);
-        tile_aggregate = prefix_op.GetBlockAggregate();
-      }
-      
-      //---------------------------------------------------------------------
-      // Zip utility methods
-      //---------------------------------------------------------------------
-
-      template <bool IS_LAST_TILE>
-      THRUST_DEVICE_FUNCTION void
-      zip_values_and_flags(size_type num_remaining,
-                           value_type (&values)[ITEMS_PER_THREAD],
-                           size_type (&segment_flags)[ITEMS_PER_THREAD],
-                           size_value_pair_t (&scan_items)[ITEMS_PER_THREAD])
-      {
-        // Zip values and segment_flags
-#pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-          // Set segment_flags for first out-of-bounds item, zero for others
-          if (IS_LAST_TILE &&
-              Size(threadIdx.x * ITEMS_PER_THREAD) + ITEM == num_remaining)
-            segment_flags[ITEM] = 1;
-
-          scan_items[ITEM].value = values[ITEM];
-          scan_items[ITEM].key   = segment_flags[ITEM];
-        }
-      }
-
-      THRUST_DEVICE_FUNCTION void unzip_values(
-          value_type (&values)[ITEMS_PER_THREAD],
-          size_value_pair_t (&scan_items)[ITEMS_PER_THREAD])
-      {
-        // Zip values and segment_flags
-#pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-          values[ITEM] = scan_items[ITEM].value;
-        }
-      }
-      
-      //---------------------------------------------------------------------
-      // Cooperatively scan a device-wide sequence of tiles with other CTAs
-      //---------------------------------------------------------------------
-
-      // Process a tile of input (dynamic chained scan)
-      //
-      template <bool IS_LAST_TILE, class AddInitToScan>
-      THRUST_DEVICE_FUNCTION void
-      consume_tile(Size          /*num_items*/,
-                   Size          num_remaining,
-                   int           tile_idx,
-                   Size          tile_base,
-                   AddInitToScan add_init_to_scan)
-      {
-        using core::sync_threadblock;
-
-        // Load items
-        key_type          keys[ITEMS_PER_THREAD];
-        value_type        values[ITEMS_PER_THREAD];
-        size_type         segment_flags[ITEMS_PER_THREAD];
-        size_value_pair_t scan_items[ITEMS_PER_THREAD];
-
-        if (IS_LAST_TILE)
-        {
-          // Fill last element with the first element
-          // because collectives are not suffix guarded
-          BlockLoadKeys(storage.load_keys)
-              .Load(keys_load_it + tile_base,
-                    keys,
-                    num_remaining,
-                    *(keys_load_it + tile_base));
-        }
-        else
-        {
-          BlockLoadKeys(storage.load_keys)
-              .Load(keys_load_it + tile_base, keys);
-        }
-
-        sync_threadblock();
-        
-        if (IS_LAST_TILE)
-        {
-          // Fill last element with the first element
-          // because collectives are not suffix guarded
-          BlockLoadValues(storage.load_values)
-              .Load(values_load_it + tile_base,
-                    values,
-                    num_remaining,
-                    *(values_load_it + tile_base));
-        }
-        else
-        {
-          BlockLoadValues(storage.load_values)
-              .Load(values_load_it + tile_base, values);
-        }
-        
-        sync_threadblock();
-
-        // first tile
-        if (tile_idx == 0)
-        {
-          BlockDiscontinuityKeys(storage.discontinuity)
-            .FlagHeads(segment_flags, keys, inequality_op);
-        
-          // Zip values and segment_flags
-          zip_values_and_flags<IS_LAST_TILE>(num_remaining,
-                                             values,
-                                             segment_flags,
-                                             scan_items);
-
-          // Exclusive scan of values and segment_flags
-          size_value_pair_t tile_aggregate;
-          scan_tile(scan_items, tile_aggregate, Inclusive());
-
-          if (threadIdx.x == 0)
-          {
-            if (!IS_LAST_TILE)
-              tile_state.SetInclusive(0, tile_aggregate);
-
-            scan_items[0].key = 0;
-          }
-        }
-        else
-        {
-          key_type tile_pred_key = (threadIdx.x == 0)
-                                       ? keys_load_it[tile_base - 1]
-                                       : key_type();
-          BlockDiscontinuityKeys(storage.discontinuity)
-              .FlagHeads(segment_flags,
-                         keys,
-                         inequality_op,
-                         tile_pred_key);
-        
-          // Zip values and segment_flags
-          zip_values_and_flags<IS_LAST_TILE>(num_remaining,
-                                             values,
-                                             segment_flags,
-                                             scan_items);
-
-          size_value_pair_t  tile_aggregate;
-          TilePrefixCallback prefix_op(tile_state, storage.prefix, scan_op, tile_idx);
-          scan_tile(scan_items, tile_aggregate, prefix_op, Inclusive());
-        }
-
-        sync_threadblock();
-
-        unzip_values(values, scan_items);
-
-        add_init_to_scan(values, segment_flags);
-
-        // Store items
-        if (IS_LAST_TILE)
-        {
-          BlockStoreValues(storage.store_values)
-            .Store(values_output_it + tile_base, values, num_remaining);
-        }
-        else
-        {
-          BlockStoreValues(storage.store_values)
-            .Store(values_output_it + tile_base, values);
-        }
-      }
-
-      //---------------------------------------------------------------------
-      // Constructor
-      //---------------------------------------------------------------------
-      
-      // Dequeue and scan tiles of items as part of a dynamic chained scan
-      // with Init functor
-      template <class AddInitToScan>
-      THRUST_DEVICE_FUNCTION
-      impl(TempStorage &  storage_,
-           ScanTileState &tile_state_,
-           KeysInputIt    keys_input_it,
-           ValuesInputIt  values_input_it,
-           ValuesOutputIt values_output_it_,
-           EqualityOp     equality_op_,
-           ScanOp         scan_op_,
-           Size           num_items,
-           AddInitToScan  add_init_to_scan)
-          : storage(storage_),
-            tile_state(tile_state_),
-            keys_load_it(core::make_load_iterator(ptx_plan(), keys_input_it)),
-            values_load_it(core::make_load_iterator(ptx_plan(), values_input_it)),
-            values_output_it(values_output_it_),
-            inequality_op(equality_op_),
-            scan_op(scan_op_)
-      {
-        int  tile_idx      = blockIdx.x;
-        Size tile_base     = ITEMS_PER_TILE * tile_idx;
-        Size num_remaining = num_items - tile_base;
-
-        if (num_remaining > ITEMS_PER_TILE)
-        {
-          // Not the last tile (full)
-          consume_tile<false>(num_items,
-                              num_remaining,
-                              tile_idx,
-                              tile_base,
-                              add_init_to_scan);
-        }
-        else if (num_remaining > 0)
-        {
-          // The last tile (possibly partially-full)
-          consume_tile<true>(num_items,
-                             num_remaining,
-                             tile_idx,
-                             tile_base,
-                             add_init_to_scan);
-        }
-      }
-    };    // struct impl
-
-    //---------------------------------------------------------------------
-    // Agent entry point
-    //---------------------------------------------------------------------
-
-    template <class AddInitToScan>
-    THRUST_AGENT_ENTRY(KeysInputIt    keys_input_it,
-                       ValuesInputIt  values_input_it,
-                       ValuesOutputIt values_output_it,
-                       EqualityOp     equaility_op,
-                       ScanOp         scan_op,
-                       ScanTileState  tile_state,
-                       Size           num_items,
-                       AddInitToScan  add_init_to_scan,
-                       char *         shmem)
-    {
-      TempStorage &storage = *reinterpret_cast<TempStorage *>(shmem);
-      impl(storage,
-           tile_state,
-           keys_input_it,
-           values_input_it,
-           values_output_it,
-           equaility_op,
-           scan_op,
-           num_items,
-           add_init_to_scan);
-    }
-
-  };    // struct ScanByKeyAgent
-  
-  template <class ScanTileState,
-            class Size>
-  struct InitAgent
+    return result;
+  }
+
+  // Convert to raw pointers if possible:
+  using KeysInUnwrapIt =
+    thrust::detail::try_unwrap_contiguous_iterator_return_t<KeysInIt>;
+  using ValuesInUnwrapIt =
+    thrust::detail::try_unwrap_contiguous_iterator_return_t<ValuesInIt>;
+  using ValuesOutUnwrapIt =
+    thrust::detail::try_unwrap_contiguous_iterator_return_t<ValuesOutIt>;
+  using AccumT = typename thrust::iterator_traits<ValuesInUnwrapIt>::value_type;
+
+  auto keys_unwrap = thrust::detail::try_unwrap_contiguous_iterator(keys);
+  auto values_unwrap = thrust::detail::try_unwrap_contiguous_iterator(values);
+  auto result_unwrap = thrust::detail::try_unwrap_contiguous_iterator(result);
+
+  using Dispatch32 = cub::DispatchScanByKey<KeysInUnwrapIt,
+                                            ValuesInUnwrapIt,
+                                            ValuesOutUnwrapIt,
+                                            EqualityOpT,
+                                            ScanOpT,
+                                            cub::NullType,
+                                            thrust::detail::int32_t,
+                                            AccumT>;
+  using Dispatch64 = cub::DispatchScanByKey<KeysInUnwrapIt,
+                                            ValuesInUnwrapIt,
+                                            ValuesOutUnwrapIt,
+                                            EqualityOpT,
+                                            ScanOpT,
+                                            cub::NullType,
+                                            thrust::detail::int64_t,
+                                            AccumT>;
+
+  cudaStream_t stream = thrust::cuda_cub::stream(policy);
+  cudaError_t status{};
+
+  // Determine temporary storage requirements:
+  std::size_t tmp_size = 0;
   {
-    template <class Arch>
-    struct PtxPlan : PtxPolicy<128> {};
-   
-    typedef core::specialize_plan<PtxPlan> ptx_plan;
-
-    //---------------------------------------------------------------------
-    // Agent entry point
-    //---------------------------------------------------------------------
-
-    THRUST_AGENT_ENTRY(ScanTileState tile_state,
-                       Size          num_tiles,
-                       char * /*shmem*/)
-    {
-      tile_state.InitializeStatus(num_tiles);
-    }
-  }; // struct InitAgent
-  
-  template<class T>
-  struct DoNothing
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                num_items,
+                                (nullptr,
+                                 tmp_size,
+                                 keys_unwrap,
+                                 values_unwrap,
+                                 result_unwrap,
+                                 equality_op,
+                                 scan_op,
+                                 cub::NullType{},
+                                 num_items_fixed,
+                                 stream));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after determining tmp storage "
+                                     "requirements for inclusive_scan_by_key");
+  }
+
+  // Run scan:
   {
-    typedef T     type;
-    template <int ITEMS_PER_THREAD, class Size>
-    THRUST_DEVICE_FUNCTION void
-    operator()(T (&/*items*/)[ITEMS_PER_THREAD],
-               Size (&/*flags*/)[ITEMS_PER_THREAD])
-    {
-    }
-  };    // struct DoNothing
-
-  template<class T, class ScanOp>
-  struct AddInitToScan
+    // Allocate temporary storage:
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived> tmp{
+      policy,
+      tmp_size};
+
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                num_items,
+                                (tmp.data().get(),
+                                 tmp_size,
+                                 keys_unwrap,
+                                 values_unwrap,
+                                 result_unwrap,
+                                 equality_op,
+                                 scan_op,
+                                 cub::NullType{},
+                                 num_items_fixed,
+                                 stream));
+
+    thrust::cuda_cub::throw_on_error(
+      status, "after dispatching inclusive_scan_by_key kernel");
+
+    thrust::cuda_cub::throw_on_error(
+      thrust::cuda_cub::synchronize_optional(policy),
+      "inclusive_scan_by_key failed to synchronize");
+  }
+
+  return result + num_items;
+}
+
+__thrust_exec_check_disable__
+template <typename Derived,
+          typename KeysInIt,
+          typename ValuesInIt,
+          typename ValuesOutIt,
+          typename InitValueT,
+          typename EqualityOpT,
+          typename ScanOpT,
+          typename SizeT>
+__host__ __device__
+ValuesOutIt exclusive_scan_by_key_n(
+  thrust::cuda_cub::execution_policy<Derived>& policy,
+  KeysInIt keys,
+  ValuesInIt values,
+  ValuesOutIt result,
+  SizeT num_items,
+  InitValueT init_value,
+  EqualityOpT equality_op,
+  ScanOpT scan_op)
+{
+
+  if (num_items == 0)
   {
-    typedef T type;
-    T         init;
-    ScanOp    scan_op;
-
-    THRUST_RUNTIME_FUNCTION
-    AddInitToScan(T init_, ScanOp scan_op_)
-        : init(init_), scan_op(scan_op_) {}
-
-    template <int ITEMS_PER_THREAD, class Size>
-    THRUST_DEVICE_FUNCTION void
-    operator()(T (&items)[ITEMS_PER_THREAD],
-               Size (&flags)[ITEMS_PER_THREAD])
-    {
-#pragma unroll
-      for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-      {
-        items[ITEM] = flags[ITEM] ? init : scan_op(init, items[ITEM]);
-      }
-    }
-  };    // struct AddInitToScan
-
-  template <class Inclusive,
-            class KeysInputIt,
-            class ValuesInputIt,
-            class ValuesOutputIt,
-            class EqualityOp,
-            class ScanOp,
-            class Size,
-            class AddInitToScan>
-  THRUST_RUNTIME_FUNCTION cudaError_t
-  doit_step(void *         d_temp_storage,
-            size_t &       temp_storage_bytes,
-            KeysInputIt    keys_input_it,
-            ValuesInputIt  values_input_it,
-            Size           num_items,
-            ValuesOutputIt values_output_it,
-            EqualityOp     equality_op,
-            ScanOp         scan_op,
-            AddInitToScan  add_init_to_scan,
-            cudaStream_t   stream,
-            bool           debug_sync)
+    return result;
+  }
+
+  // Convert to raw pointers if possible:
+  using KeysInUnwrapIt =
+    thrust::detail::try_unwrap_contiguous_iterator_return_t<KeysInIt>;
+  using ValuesInUnwrapIt =
+    thrust::detail::try_unwrap_contiguous_iterator_return_t<ValuesInIt>;
+  using ValuesOutUnwrapIt =
+    thrust::detail::try_unwrap_contiguous_iterator_return_t<ValuesOutIt>;
+
+  auto keys_unwrap = thrust::detail::try_unwrap_contiguous_iterator(keys);
+  auto values_unwrap = thrust::detail::try_unwrap_contiguous_iterator(values);
+  auto result_unwrap = thrust::detail::try_unwrap_contiguous_iterator(result);
+
+  using Dispatch32 = cub::DispatchScanByKey<KeysInUnwrapIt,
+                                            ValuesInUnwrapIt,
+                                            ValuesOutUnwrapIt,
+                                            EqualityOpT,
+                                            ScanOpT,
+                                            InitValueT,
+                                            thrust::detail::int32_t,
+                                            InitValueT>;
+  using Dispatch64 = cub::DispatchScanByKey<KeysInUnwrapIt,
+                                            ValuesInUnwrapIt,
+                                            ValuesOutUnwrapIt,
+                                            EqualityOpT,
+                                            ScanOpT,
+                                            InitValueT,
+                                            thrust::detail::int64_t,
+                                            InitValueT>;
+
+  cudaStream_t stream = thrust::cuda_cub::stream(policy);
+  cudaError_t status{};
+
+  // Determine temporary storage requirements:
+  std::size_t tmp_size = 0;
   {
-    using core::AgentPlan;
-    using core::AgentLauncher;
-
-    cudaError_t status = cudaSuccess;
-    if (num_items == 0)
-      return cudaErrorNotSupported;
-
-    typedef typename AddInitToScan::type T;
-
-    typedef AgentLauncher<
-        ScanByKeyAgent<KeysInputIt,
-                       ValuesInputIt,
-                       ValuesOutputIt,
-                       EqualityOp,
-                       ScanOp,
-                       Size,
-                       T,
-                       Inclusive> >
-        scan_by_key_agent;
-
-    typedef typename scan_by_key_agent::ScanTileState ScanTileState;
-
-    typedef AgentLauncher<InitAgent<ScanTileState, Size> > init_agent;
-
-    AgentPlan scan_by_key_plan = scan_by_key_agent::get_plan(stream);
-    AgentPlan init_plan        = init_agent::get_plan();
-
-    int tile_size = scan_by_key_plan.items_per_tile;
-    size_t num_tiles = (num_items + tile_size - 1) / tile_size;
-
-    size_t vshmem_size = core::vshmem_size(scan_by_key_plan.shared_memory_size,
-                                           num_tiles);
-
-    size_t allocation_sizes[2] = {0, vshmem_size};
-    status = ScanTileState::AllocationSize(static_cast<int>(num_tiles), allocation_sizes[0]);
-    CUDA_CUB_RET_IF_FAIL(status);
-
-    void *allocations[2] = {NULL, NULL};
-    status               = cub::AliasTemporaries(d_temp_storage,
-                                   temp_storage_bytes,
-                                   allocations,
-                                   allocation_sizes);
-    CUDA_CUB_RET_IF_FAIL(status);
-
-    if (d_temp_storage == NULL)
-    {
-      return status;
-    }
-
-    ScanTileState tile_state;
-    status = tile_state.Init(static_cast<int>(num_tiles), allocations[0], allocation_sizes[0]);
-    CUDA_CUB_RET_IF_FAIL(status);
-
-    char *vshmem_ptr = vshmem_size > 0 ? (char*)allocations[1] : NULL;
-
-    init_agent ia(init_plan, num_tiles, stream, "scan_by_key::init_agent", debug_sync);
-    ia.launch(tile_state, num_tiles);
-    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
-
-    scan_by_key_agent sbka(scan_by_key_plan, num_items, stream, vshmem_ptr, "scan_by_key::scan_agent", debug_sync);
-    sbka.launch(keys_input_it,
-                values_input_it,
-                values_output_it,
-                equality_op,
-                scan_op,
-                tile_state,
-                num_items,
-                add_init_to_scan);
-    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
-    return status;
-  }    // func doit_pass
-
-  template <typename Inclusive,
-            typename Derived,
-            typename KeysInputIt,
-            typename ValuesInputIt,
-            typename ValuesOutputIt,
-            typename EqualityOp,
-            typename ScanOp,
-            typename AddInitToScan>
-  THRUST_RUNTIME_FUNCTION
-  ValuesOutputIt scan_by_key(execution_policy<Derived>& policy,
-                             KeysInputIt                keys_first,
-                             KeysInputIt                keys_last,
-                             ValuesInputIt              values_first,
-                             ValuesOutputIt             values_result,
-                             EqualityOp                 equality_op,
-                             ScanOp                     scan_op,
-                             AddInitToScan              add_init_to_scan)
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                num_items,
+                                (nullptr,
+                                 tmp_size,
+                                 keys_unwrap,
+                                 values_unwrap,
+                                 result_unwrap,
+                                 equality_op,
+                                 scan_op,
+                                 init_value,
+                                 num_items_fixed,
+                                 stream));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after determining tmp storage "
+                                     "requirements for exclusive_scan_by_key");
+  }
+
+  // Run scan:
   {
-    int          num_items    = static_cast<int>(thrust::distance(keys_first, keys_last));
-    size_t       storage_size = 0;
-    cudaStream_t stream       = cuda_cub::stream(policy);
-    bool         debug_sync   = THRUST_DEBUG_SYNC_FLAG;
-
-    if (num_items == 0)
-      return values_result;
-    
-    cudaError_t status;
-    status = doit_step<Inclusive>(NULL,
-                                  storage_size,
-                                  keys_first,
-                                  values_first,
-                                  num_items,
-                                  values_result,
-                                  equality_op,
-                                  scan_op,
-                                  add_init_to_scan,
-                                  stream,
-                                  debug_sync);
-    cuda_cub::throw_on_error(status, "scan_by_key: failed on 1st step");
-    
-    // Allocate temporary storage.
-    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
-      tmp(policy, storage_size);
-    void *ptr = static_cast<void*>(tmp.data().get());
-
-    status = doit_step<Inclusive>(ptr,
-                                  storage_size,
-                                  keys_first,
-                                  values_first,
-                                  num_items,
-                                  values_result,
-                                  equality_op,
-                                  scan_op,
-                                  add_init_to_scan,
-                                  stream,
-                                  debug_sync);
-    cuda_cub::throw_on_error(status, "scan_by_key: failed on 2nd step");
-    
-    status = cuda_cub::synchronize(policy);
-    cuda_cub::throw_on_error(status, "scan_by_key: failed to synchronize");
-
-    return values_result + num_items;
-  }    // func doit
-}    // namspace scan_by_key
+    // Allocate temporary storage:
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived> tmp{
+      policy,
+      tmp_size};
+
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                num_items,
+                                (tmp.data().get(),
+                                 tmp_size,
+                                 keys_unwrap,
+                                 values_unwrap,
+                                 result_unwrap,
+                                 equality_op,
+                                 scan_op,
+                                 init_value,
+                                 num_items_fixed,
+                                 stream));
+
+    thrust::cuda_cub::throw_on_error(
+      status, "after dispatching exclusive_scan_by_key kernel");
+
+    thrust::cuda_cub::throw_on_error(
+      thrust::cuda_cub::synchronize_optional(policy),
+      "exclusive_scan_by_key failed to synchronize");
+  }
+
+  return result + num_items;
+}
+
+
+} // namespace detail
 
 //-------------------------
 // Thrust API entry points
@@ -805,30 +309,23 @@ inclusive_scan_by_key(execution_policy<Derived> &policy,
                       ScanOp                     scan_op)
 {
   ValOutputIt ret = value_result;
-  if (__THRUST_HAS_CUDART__)
-  {
-    typedef typename iterator_traits<ValInputIt>::value_type T;
-    ret = __scan_by_key::scan_by_key<thrust::detail::true_type>(policy,
-                                                        key_first,
-                                                        key_last,
-                                                        value_first,
-                                                        value_result,
-                                                        binary_pred,
-                                                        scan_op,
-                                                        __scan_by_key::DoNothing<T>());
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::inclusive_scan_by_key(cvt_to_seq(derived_cast(policy)),
-                                        key_first,
-                                        key_last,
-                                        value_first,
-                                        value_result,
-                                        binary_pred,
-                                        scan_op);
-#endif
-  }
+  THRUST_CDP_DISPATCH(
+    (ret = thrust::cuda_cub::detail::inclusive_scan_by_key_n(
+       policy,
+       key_first,
+       value_first,
+       value_result,
+       thrust::distance(key_first, key_last),
+       binary_pred,
+       scan_op);),
+    (ret = thrust::inclusive_scan_by_key(cvt_to_seq(derived_cast(policy)),
+                                         key_first,
+                                         key_last,
+                                         value_first,
+                                         value_result,
+                                         binary_pred,
+                                         scan_op);));
+
   return ret;
 }
 
@@ -845,14 +342,13 @@ inclusive_scan_by_key(execution_policy<Derived> &policy,
                       ValOutputIt                value_result,
                       BinaryPred                 binary_pred)
 {
-  typedef typename thrust::iterator_traits<ValOutputIt>::value_type value_type;
   return cuda_cub::inclusive_scan_by_key(policy,
                                          key_first,
                                          key_last,
                                          value_first,
                                          value_result,
                                          binary_pred,
-                                         plus<value_type>());
+                                         thrust::plus<>());
 }
 
 template <class Derived,
@@ -866,13 +362,12 @@ inclusive_scan_by_key(execution_policy<Derived> &policy,
                       ValInputIt                 value_first,
                       ValOutputIt                value_result)
 {
-  typedef typename thrust::iterator_traits<KeyInputIt>::value_type key_type;
   return cuda_cub::inclusive_scan_by_key(policy,
                                          key_first,
                                          key_last,
                                          value_first,
                                          value_result,
-                                         equal_to<key_type>());
+                                         thrust::equal_to<>());
 }
 
 
@@ -899,31 +394,24 @@ exclusive_scan_by_key(execution_policy<Derived> &policy,
                       ScanOp                     scan_op)
 {
   ValOutputIt ret = value_result;
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __scan_by_key::scan_by_key<thrust::detail::false_type>(
-        policy,
-        key_first,
-        key_last,
-        value_first,
-        value_result,
-        binary_pred,
-        scan_op,
-        __scan_by_key::AddInitToScan<Init, ScanOp>(init, scan_op));
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::exclusive_scan_by_key(cvt_to_seq(derived_cast(policy)),
-                                        key_first,
-                                        key_last,
-                                        value_first,
-                                        value_result,
-                                        init,
-                                        binary_pred,
-                                        scan_op);
-#endif
-  }
+  THRUST_CDP_DISPATCH(
+    (ret = thrust::cuda_cub::detail::exclusive_scan_by_key_n(
+       policy,
+       key_first,
+       value_first,
+       value_result,
+       thrust::distance(key_first, key_last),
+       init,
+       binary_pred,
+       scan_op);),
+    (ret = thrust::exclusive_scan_by_key(cvt_to_seq(derived_cast(policy)),
+                                         key_first,
+                                         key_last,
+                                         value_first,
+                                         value_result,
+                                         init,
+                                         binary_pred,
+                                         scan_op);));
   return ret;
 }
 
@@ -949,7 +437,7 @@ exclusive_scan_by_key(execution_policy<Derived> &policy,
                                          value_result,
                                          init,
                                          binary_pred,
-                                         plus<Init>());
+                                         thrust::plus<>());
 }
 
 template <class Derived,
@@ -965,14 +453,13 @@ exclusive_scan_by_key(execution_policy<Derived> &policy,
                       ValOutputIt                value_result,
                       Init                       init)
 {
-  typedef typename iterator_traits<KeyInputIt>::value_type key_type;
   return cuda_cub::exclusive_scan_by_key(policy,
                                          key_first,
                                          key_last,
                                          value_first,
                                          value_result,
                                          init,
-                                         equal_to<key_type>());
+                                         thrust::equal_to<>());
 }
 
 
@@ -987,19 +474,19 @@ exclusive_scan_by_key(execution_policy<Derived> &policy,
                       ValInputIt                 value_first,
                       ValOutputIt                value_result)
 {
-  typedef typename iterator_traits<ValOutputIt>::value_type value_type;
+  using value_type = typename thrust::iterator_traits<ValInputIt>::value_type;
   return cuda_cub::exclusive_scan_by_key(policy,
                                          key_first,
                                          key_last,
                                          value_first,
                                          value_result,
-                                         value_type(0));
+                                         value_type{});
 }
 
 
 }    // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #include <thrust/scan.h>
 
-#endif
+#endif // NVCC
diff --git a/thrust/system/cuda/detail/scatter.h b/thrust/system/cuda/detail/scatter.h
index e3ba3d87d..e297d782d 100644
--- a/thrust/system/cuda/detail/scatter.h
+++ b/thrust/system/cuda/detail/scatter.h
@@ -26,12 +26,13 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <thrust/system/cuda/detail/transform.h>
 #include <thrust/iterator/permutation_iterator.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 template <class Derived,
@@ -101,5 +102,5 @@ scatter_if(execution_policy<Derived>& policy,
 
 
 } // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/set_operations.h b/thrust/system/cuda/detail/set_operations.h
index 43ae73d64..98bb4bb5d 100644
--- a/thrust/system/cuda/detail/set_operations.h
+++ b/thrust/system/cuda/detail/set_operations.h
@@ -26,23 +26,27 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-#include <thrust/system/cuda/detail/util.h>
 
+#include <thrust/detail/alignment.h>
 #include <thrust/detail/cstdint.h>
+#include <thrust/detail/mpl/math.h>
 #include <thrust/detail/temporary_array.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
-#include <thrust/system/cuda/detail/core/agent_launcher.h>
-#include <thrust/system/cuda/detail/par_to_seq.h>
-#include <thrust/system/cuda/detail/get_value.h>
+#include <thrust/distance.h>
 #include <thrust/extrema.h>
 #include <thrust/pair.h>
 #include <thrust/set_operations.h>
-#include <thrust/detail/mpl/math.h>
-#include <thrust/distance.h>
-#include <thrust/detail/alignment.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/get_value.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/util.h>
 
-THRUST_BEGIN_NS
+
+THRUST_NAMESPACE_BEGIN
 
 namespace cuda_cub {
 
@@ -50,35 +54,36 @@ namespace __set_operations {
 
   template <bool UpperBound,
             class IntT,
+            class Size,
             class It,
             class T,
             class Comp>
   THRUST_DEVICE_FUNCTION void
   binary_search_iteration(It   data,
-                          int &begin,
-                          int &end,
+                          Size &begin,
+                          Size &end,
                           T    key,
                           int  shift,
                           Comp comp)
   {
 
     IntT scale = (1 << shift) - 1;
-    int  mid   = (int)((begin + scale * end) >> shift);
+    Size mid   = (begin + scale * end) >> shift;
 
     T    key2 = data[mid];
     bool pred = UpperBound ? !comp(key, key2) : comp(key2, key);
     if (pred)
-      begin = (int)mid + 1;
+      begin = mid + 1;
     else
       end = mid;
   }
 
-  template <bool UpperBound, class T, class It, class Comp>
-  THRUST_DEVICE_FUNCTION int
-  binary_search(It data, int count, T key, Comp comp)
+  template <bool UpperBound, class Size, class T, class It, class Comp>
+  THRUST_DEVICE_FUNCTION Size
+  binary_search(It data, Size count, T key, Comp comp)
   {
-    int begin = 0;
-    int end   = count;
+    Size begin = 0;
+    Size end   = count;
     while (begin < end)
       binary_search_iteration<UpperBound, int>(data,
                                                begin,
@@ -89,12 +94,12 @@ namespace __set_operations {
     return begin;
   }
 
-  template <bool UpperBound, class IntT, class T, class It, class Comp>
-  THRUST_DEVICE_FUNCTION int
-  biased_binary_search(It data, int count, T key, IntT levels, Comp comp)
+  template <bool UpperBound, class IntT, class Size, class T, class It, class Comp>
+  THRUST_DEVICE_FUNCTION Size
+  biased_binary_search(It data, Size count, T key, IntT levels, Comp comp)
   {
-    int begin = 0;
-    int end   = count;
+    Size begin = 0;
+    Size end   = count;
 
     if (levels >= 4 && begin < end)
       binary_search_iteration<UpperBound, IntT>(data, begin, end, key, 9, comp);
@@ -110,18 +115,18 @@ namespace __set_operations {
     return begin;
   }
 
-  template <bool UpperBound, class It1, class It2, class Comp>
-  THRUST_DEVICE_FUNCTION int
-  merge_path(It1 a, int aCount, It2 b, int bCount, int diag, Comp comp)
+  template <bool UpperBound, class Size, class It1, class It2, class Comp>
+  THRUST_DEVICE_FUNCTION Size
+  merge_path(It1 a, Size aCount, It2 b, Size bCount, Size diag, Comp comp)
   {
     typedef typename thrust::iterator_traits<It1>::value_type T;
 
-    int begin = thrust::max(0, diag - bCount);
-    int end   = thrust::min(diag, aCount);
+    Size begin = thrust::max<Size>(0, diag - bCount);
+    Size end   = thrust::min<Size>(diag, aCount);
 
     while (begin < end)
     {
-      int  mid  = (begin + end) >> 1;
+      Size  mid  = (begin + end) >> 1;
       T    aKey = a[mid];
       T    bKey = b[diag - 1 - mid];
       bool pred = UpperBound ? comp(aKey, bKey) : !comp(bKey, aKey);
@@ -132,9 +137,9 @@ namespace __set_operations {
     }
     return begin;
   }
-  
+
   template <class It1, class It2, class Size, class Size2, class CompareOp>
-  pair<Size, Size> THRUST_DEVICE_FUNCTION
+  THRUST_DEVICE_FUNCTION pair<Size, Size>
   balanced_path(It1       keys1,
                 It2       keys2,
                 Size      num_keys1,
@@ -202,15 +207,13 @@ namespace __set_operations {
             int                      _ITEMS_PER_THREAD = 1,
             cub::BlockLoadAlgorithm  _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
             cub::CacheLoadModifier   _LOAD_MODIFIER    = cub::LOAD_LDG,
-            cub::BlockScanAlgorithm  _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS,
-            int                      _MIN_BLOCKS       = 1>
+            cub::BlockScanAlgorithm  _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS>
   struct PtxPolicy
   {
     enum
     {
       BLOCK_THREADS    = _BLOCK_THREADS,
       ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
-      MIN_BLOCKS       = _MIN_BLOCKS,
       ITEMS_PER_TILE   = _BLOCK_THREADS * _ITEMS_PER_THREAD - 1
     };
 
@@ -221,9 +224,9 @@ namespace __set_operations {
 
   template<class Arch, class T, class U>
   struct Tuning;
-  
+
   namespace mpl = thrust::detail::mpl::math;
-  
+
   template<class T, class U>
   struct Tuning<sm30,T,U>
   {
@@ -238,9 +241,9 @@ namespace __set_operations {
           mpl::max<
               int,
               1,
-              ((NOMINAL_4B_ITEMS_PER_THREAD * 4) +
+              static_cast<int>(((NOMINAL_4B_ITEMS_PER_THREAD * 4) +
                COMBINED_INPUT_BYTES - 1) /
-                  COMBINED_INPUT_BYTES>::value>::value,
+                  COMBINED_INPUT_BYTES)>::value>::value,
     };
 
     typedef PtxPolicy<128,
@@ -265,9 +268,9 @@ namespace __set_operations {
           mpl::max<
               int,
               1,
-              ((NOMINAL_4B_ITEMS_PER_THREAD * 4) +
+              static_cast<int>(((NOMINAL_4B_ITEMS_PER_THREAD * 4) +
                COMBINED_INPUT_BYTES - 1) /
-                  COMBINED_INPUT_BYTES>::value>::value,
+                  COMBINED_INPUT_BYTES)>::value>::value,
     };
 
     typedef PtxPolicy<256,
@@ -292,9 +295,9 @@ namespace __set_operations {
           mpl::max<
               int,
               1,
-              ((NOMINAL_4B_ITEMS_PER_THREAD * 4) +
+              static_cast<int>(((NOMINAL_4B_ITEMS_PER_THREAD * 4) +
                COMBINED_INPUT_BYTES - 1) /
-                  COMBINED_INPUT_BYTES>::value>::value,
+                  COMBINED_INPUT_BYTES)>::value>::value,
     };
 
     typedef PtxPolicy<512,
@@ -324,9 +327,9 @@ namespace __set_operations {
 
     typedef key1_type  key_type;
     typedef value1_type value_type;
-    
+
     typedef cub::ScanTileState<Size> ScanTileState;
-    
+
     template <class Arch>
     struct PtxPlan : Tuning<Arch, key_type, value_type>::type
     {
@@ -360,18 +363,18 @@ namespace __set_operations {
       //
       union TempStorage
       {
-        struct
+        struct ScanStorage
         {
           typename BlockScan::TempStorage          scan;
           typename TilePrefixCallback::TempStorage prefix;
-        };
+        } scan_storage;
 
-        struct
+        struct LoadStorage
         {
-          core::uninitialized_array<int, PtxPlan::BLOCK_THREADS>
-              offset;
+          core::uninitialized_array<int, PtxPlan::BLOCK_THREADS> offset;
           union
           {
+            // FIXME These don't appear to be used anywhere?
             typename BlockLoadKeys1::TempStorage   load_keys1;
             typename BlockLoadKeys2::TempStorage   load_keys2;
             typename BlockLoadValues1::TempStorage load_values1;
@@ -389,8 +392,8 @@ namespace __set_operations {
                 value_type,
                 PtxPlan::ITEMS_PER_TILE + PtxPlan::BLOCK_THREADS>
                 values_shared;
-          };
-        };
+          }; // anon union
+        } load_storage; // struct LoadStorage
       };    // union TempStorage
     };      // struct PtxPlan
 
@@ -436,7 +439,7 @@ namespace __set_operations {
       CompareOp      compare_op;
       SetOp          set_op;
       pair<Size, Size> *partitions;
-      Size *output_count;
+      std::size_t *output_count;
 
       //---------------------------------------------------------------------
       // Utility functions
@@ -498,7 +501,7 @@ namespace __set_operations {
           output[idx] = input[ITEM];
         }
       }
-      
+
       template <class OutputIt, class T, class SharedIt>
       void THRUST_DEVICE_FUNCTION
       scatter(OutputIt output,
@@ -510,7 +513,7 @@ namespace __set_operations {
               int      tile_output_count)
       {
         using core::sync_threadblock;
-        
+
 
 
         int local_scatter_idx = thread_output_prefix - tile_output_prefix;
@@ -578,9 +581,9 @@ namespace __set_operations {
         //
         int num_keys1 = static_cast<int>(keys1_end - keys1_beg);
         int num_keys2 = static_cast<int>(keys2_end - keys2_beg);
-        
-       
-       // load keys into shared memory for further processing 
+
+
+       // load keys into shared memory for further processing
         key_type keys_loc[ITEMS_PER_THREAD];
 
         gmem_to_reg<!IS_LAST_TILE>(keys_loc,
@@ -588,8 +591,8 @@ namespace __set_operations {
                                    keys2_in + keys2_beg,
                                    num_keys1,
                                    num_keys2);
-        
-        reg_to_shared(&storage.keys_shared[0], keys_loc);
+
+        reg_to_shared(&storage.load_storage.keys_shared[0], keys_loc);
 
         sync_threadblock();
 
@@ -597,14 +600,14 @@ namespace __set_operations {
                                 num_keys1 + num_keys2);
 
         pair<int, int> partition_loc =
-            balanced_path(&storage.keys_shared[0],
-                          &storage.keys_shared[num_keys1],
+            balanced_path(&storage.load_storage.keys_shared[0],
+                          &storage.load_storage.keys_shared[num_keys1],
                           num_keys1,
                           num_keys2,
                           diag_loc,
                           4,
                           compare_op);
-        
+
         int keys1_beg_loc = partition_loc.first;
         int keys2_beg_loc = partition_loc.second;
 
@@ -615,25 +618,25 @@ namespace __set_operations {
                         : (partition_loc.first << 16) | partition_loc.second;
 
         int dst = threadIdx.x == 0 ? BLOCK_THREADS - 1 : threadIdx.x - 1;
-        storage.offset[dst] = value;
+        storage.load_storage.offset[dst] = value;
 
         core::sync_threadblock();
 
         pair<int,int> partition1_loc = thrust::make_pair(
-          storage.offset[threadIdx.x] >> 16,
-          storage.offset[threadIdx.x] & 0xFFFF);
+          storage.load_storage.offset[threadIdx.x] >> 16,
+          storage.load_storage.offset[threadIdx.x] & 0xFFFF);
 
         int keys1_end_loc = partition1_loc.first;
         int keys2_end_loc = partition1_loc.second;
 
         int num_keys1_loc = keys1_end_loc - keys1_beg_loc;
         int num_keys2_loc = keys2_end_loc - keys2_beg_loc;
-        
+
         // perform serial set operation
         //
         int indices[ITEMS_PER_THREAD];
 
-        int active_mask = serial_set_op(&storage.keys_shared[0],
+        int active_mask = serial_set_op(&storage.load_storage.keys_shared[0],
                                         keys1_beg_loc,
                                         keys2_beg_loc + num_keys1,
                                         num_keys1_loc,
@@ -657,7 +660,7 @@ namespace __set_operations {
 
         if (tile_idx == 0)    // first tile
         {
-          BlockScan(storage.scan)
+          BlockScan(storage.scan_storage.scan)
               .ExclusiveSum(thread_output_count,
                             thread_output_prefix,
                             tile_output_count);
@@ -673,11 +676,11 @@ namespace __set_operations {
         else
         {
           TilePrefixCallback prefix_cb(tile_state,
-                                       storage.prefix,
+                                       storage.scan_storage.prefix,
                                        cub::Sum(),
                                        tile_idx);
 
-          BlockScan(storage.scan)
+          BlockScan(storage.scan_storage.scan)
               .ExclusiveSum(thread_output_count,
                             thread_output_prefix,
                             prefix_cb);
@@ -691,7 +694,7 @@ namespace __set_operations {
         //
         scatter(keys_out,
                 keys_loc,
-                &storage.keys_shared[0],
+                &storage.load_storage.keys_shared[0],
                 active_mask,
                 thread_output_prefix,
                 tile_output_prefix,
@@ -708,7 +711,7 @@ namespace __set_operations {
 
           sync_threadblock();
 
-          reg_to_shared(&storage.values_shared[0], values_loc);
+          reg_to_shared(&storage.load_storage.values_shared[0], values_loc);
 
           sync_threadblock();
 
@@ -719,7 +722,7 @@ namespace __set_operations {
           {
             if (active_mask & (1 << ITEM))
             {
-              values_loc[ITEM] = storage.values_shared[indices[ITEM]];
+              values_loc[ITEM] = storage.load_storage.values_shared[indices[ITEM]];
             }
           }
 
@@ -727,7 +730,7 @@ namespace __set_operations {
 
           scatter(values_out,
                   values_loc,
-                  &storage.values_shared[0],
+                  &storage.load_storage.values_shared[0],
                   active_mask,
                   thread_output_prefix,
                   tile_output_prefix,
@@ -758,7 +761,7 @@ namespace __set_operations {
            CompareOp      compare_op_,
            SetOp          set_op_,
            pair<Size, Size> *partitions_,
-           Size *output_count_)
+           std::size_t * output_count_)
           : storage(storage_),
             tile_state(tile_state_),
             keys1_in(core::make_load_iterator(ptx_plan(), keys1_)),
@@ -772,7 +775,7 @@ namespace __set_operations {
             compare_op(compare_op_),
             set_op(set_op_),
             partitions(partitions_),
-            output_count(output_count_) 
+            output_count(output_count_)
       {
         int  tile_idx      = blockIdx.x;
         int  num_tiles     = gridDim.x;
@@ -781,7 +784,7 @@ namespace __set_operations {
         {
           consume_tile<false>(tile_idx);
         }
-        else 
+        else
         {
           consume_tile<true>(tile_idx);
         }
@@ -803,7 +806,7 @@ namespace __set_operations {
                        CompareOp      compare_op,
                        SetOp          set_op,
                        pair<Size, Size> *partitions,
-                       Size *        output_count,
+                       std::size_t *  output_count,
                        ScanTileState tile_state,
                        char *        shmem)
     {
@@ -825,7 +828,7 @@ namespace __set_operations {
            output_count);
     }
   };    // struct SetOpAgent
-  
+
   template <class KeysIt1,
             class KeysIt2,
             class Size,
@@ -867,7 +870,7 @@ namespace __set_operations {
       }
     }
   };    // struct PartitionAgent
-  
+
   template <class ScanTileState,
             class Size>
   struct InitAgent
@@ -939,7 +942,7 @@ namespace __set_operations {
       return active_mask;
     }
   };    // struct serial_set_intersection
-  
+
   // serial_set_symmetric_difference
   // ---------------------
   // emit A if A < B and emit B if B < A.
@@ -984,8 +987,8 @@ namespace __set_operations {
         // The outputs must come from A by definition of set difference.
         output[i]  = pA ? aKey : bKey;
         indices[i] = pA ? aBegin : bBegin;
-        
-        if (aBegin + bBegin < end && pA != pB) 
+
+        if (aBegin + bBegin < end && pA != pB)
           active_mask |= 1 << i;
 
         if (!pB) {aKey = keys[++aBegin]; }
@@ -1039,7 +1042,7 @@ namespace __set_operations {
         // The outputs must come from A by definition of set difference.
         output[i]  = aKey;
         indices[i] = aBegin;
-        
+
         if (aBegin + bBegin < end && pA)
           active_mask |= 1 << i;
 
@@ -1049,7 +1052,7 @@ namespace __set_operations {
       return active_mask;
     }
   };    // struct set_difference
-  
+
   // serial_set_union
   // ----------------
   // emit A if A <= B else emit B
@@ -1093,7 +1096,7 @@ namespace __set_operations {
         // Output A in case of a tie, so check if b < a.
         output[i]  = pB ? bKey : aKey;
         indices[i] = pB ? bBegin : aBegin;
-        
+
         if (aBegin + bBegin < end)
           active_mask |= 1 << i;
 
@@ -1126,18 +1129,17 @@ namespace __set_operations {
             Size           num_keys2,
             KeysOutputIt   keys_output,
             ValuesOutputIt values_output,
-            Size *         output_count,
+            std::size_t *  output_count,
             CompareOp      compare_op,
             SetOp          set_op,
-            cudaStream_t   stream,
-            bool           debug_sync)
+            cudaStream_t   stream)
   {
     Size keys_total = num_keys1 + num_keys2;
     if (keys_total == 0)
       return cudaErrorNotSupported;
 
     cudaError_t status = cudaSuccess;
-    
+
     using core::AgentPlan;
     using core::AgentLauncher;
 
@@ -1156,7 +1158,7 @@ namespace __set_operations {
 
     typedef AgentLauncher<PartitionAgent<KeysIt1, KeysIt2, Size, CompareOp> >
         partition_agent;
-    
+
     typedef typename set_op_agent::ScanTileState ScanTileState;
     typedef AgentLauncher<InitAgent<ScanTileState, Size> > init_agent;
 
@@ -1169,7 +1171,8 @@ namespace __set_operations {
     Size num_tiles = (keys_total + tile_size - 1) / tile_size;
 
     size_t tile_agent_storage;
-    status = ScanTileState::AllocationSize(static_cast<int>(num_tiles), tile_agent_storage);
+    status = ScanTileState::AllocationSize(static_cast<int>(num_tiles),
+                                           tile_agent_storage);
     CUDA_CUB_RET_IF_FAIL(status);
 
     size_t vshmem_storage = core::vshmem_size(set_op_plan.shared_memory_size,
@@ -1193,17 +1196,19 @@ namespace __set_operations {
     }
 
     ScanTileState tile_state;
-    status = tile_state.Init(static_cast<int>(num_tiles), allocations[0], allocation_sizes[0]);
+    status = tile_state.Init(static_cast<int>(num_tiles),
+                             allocations[0],
+                             allocation_sizes[0]);
     CUDA_CUB_RET_IF_FAIL(status);
 
     pair<Size, Size> *partitions = (pair<Size, Size> *)allocations[1];
     char *vshmem_ptr = vshmem_storage > 0 ? (char *)allocations[2] : NULL;
 
-    init_agent ia(init_plan, num_tiles, stream, "set_op::init_agent", debug_sync);
+    init_agent ia(init_plan, num_tiles, stream, "set_op::init_agent");
     ia.launch(tile_state, num_tiles);
     CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
 
-    partition_agent pa(partition_plan, num_tiles+1, stream, "set_op::partition agent", debug_sync);
+    partition_agent pa(partition_plan, num_tiles+1, stream, "set_op::partition agent");
     pa.launch(keys1,
               keys2,
               num_keys1,
@@ -1214,7 +1219,7 @@ namespace __set_operations {
               tile_size);
     CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
 
-    set_op_agent sa(set_op_plan, keys_total, stream, vshmem_ptr, "set_op::set_op_agent", debug_sync);
+    set_op_agent sa(set_op_plan, keys_total, stream, vshmem_ptr, "set_op::set_op_agent");
     sa.launch(keys1,
               keys2,
               values1,
@@ -1264,30 +1269,29 @@ namespace __set_operations {
 
     if (num_keys1 + num_keys2 == 0)
       return thrust::make_pair(keys_output, values_output);
-     
+
     size_t       temp_storage_bytes = 0;
     cudaStream_t stream             = cuda_cub::stream(policy);
-    bool         debug_sync         = THRUST_DEBUG_SYNC_FLAG;
 
     cudaError_t status;
-    status = doit_step<HAS_VALUES>(NULL,
+    THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, doit_step<HAS_VALUES>,
+        num_keys1, num_keys2, (NULL,
                                    temp_storage_bytes,
                                    keys1_first,
                                    keys2_first,
                                    values1_first,
                                    values2_first,
-                                   num_keys1,
-                                   num_keys2,
+                                   num_keys1_fixed,
+                                   num_keys2_fixed,
                                    keys_output,
                                    values_output,
-                                   reinterpret_cast<size_type*>(NULL),
+                                   reinterpret_cast<std::size_t*>(NULL),
                                    compare_op,
                                    set_op,
-                                   stream,
-                                   debug_sync);
+                                   stream));
     cuda_cub::throw_on_error(status, "set_operations failed on 1st step");
 
-    size_t allocation_sizes[2] = {sizeof(size_type), temp_storage_bytes};
+    size_t allocation_sizes[2] = {sizeof(std::size_t), temp_storage_bytes};
     void * allocations[2]      = {NULL, NULL};
 
     size_t storage_size = 0;
@@ -1309,30 +1313,30 @@ namespace __set_operations {
                                  allocation_sizes);
     cuda_cub::throw_on_error(status, "set_operations failed on 2nd alias_storage");
 
-    size_type* d_output_count
-      = thrust::detail::aligned_reinterpret_cast<size_type*>(allocations[0]);
+    std::size_t* d_output_count
+      = thrust::detail::aligned_reinterpret_cast<std::size_t*>(allocations[0]);
 
-    status = doit_step<HAS_VALUES>(allocations[1],
+    THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, doit_step<HAS_VALUES>,
+        num_keys1, num_keys2, (allocations[1],
                                    temp_storage_bytes,
                                    keys1_first,
                                    keys2_first,
                                    values1_first,
                                    values2_first,
-                                   num_keys1,
-                                   num_keys2,
+                                   num_keys1_fixed,
+                                   num_keys2_fixed,
                                    keys_output,
                                    values_output,
                                    d_output_count,
                                    compare_op,
                                    set_op,
-                                   stream,
-                                   debug_sync);
+                                   stream));
     cuda_cub::throw_on_error(status, "set_operations failed on 2nd step");
-    
+
     status = cuda_cub::synchronize(policy);
     cuda_cub::throw_on_error(status, "set_operations failed to synchronize");
 
-    size_type output_count = cuda_cub::get_value(policy, d_output_count);
+    std::size_t output_count = cuda_cub::get_value(policy, d_output_count);
 
     return thrust::make_pair(keys_output + output_count, values_output + output_count);
   }
@@ -1357,38 +1361,30 @@ set_difference(execution_policy<Derived> &policy,
                OutputIt                   result,
                CompareOp                  compare)
 {
-  OutputIt ret = result;
-  if (__THRUST_HAS_CUDART__)
-  {
-    typename thrust::iterator_value<ItemsIt1>::type *null_ = NULL;
-    //
-    ret = __set_operations::set_operations<thrust::detail::false_type>(
-              policy,
-              items1_first,
-              items1_last,
-              items2_first,
-              items2_last,
-              null_,
-              null_,
-              result,
-              null_,
-              compare,
-              __set_operations::serial_set_difference())
-              .first;
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::set_difference(cvt_to_seq(derived_cast(policy)),
-                                 items1_first,
-                                 items1_last,
-                                 items2_first,
-                                 items2_last,
-                                 result,
-                                 compare);
-#endif
-  }
-  return ret;
+  THRUST_CDP_DISPATCH(
+    (using items1_t  = thrust::iterator_value_t<ItemsIt1>;
+     items1_t *null_ = nullptr;
+     auto tmp = __set_operations::set_operations<thrust::detail::false_type>(
+       policy,
+       items1_first,
+       items1_last,
+       items2_first,
+       items2_last,
+       null_,
+       null_,
+       result,
+       null_,
+       compare,
+       __set_operations::serial_set_difference());
+     result = tmp.first;),
+    (result = thrust::set_difference(cvt_to_seq(derived_cast(policy)),
+                                     items1_first,
+                                     items1_last,
+                                     items2_first,
+                                     items2_last,
+                                     result,
+                                     compare);));
+  return result;
 }
 
 template <class Derived,
@@ -1431,38 +1427,30 @@ set_intersection(execution_policy<Derived> &policy,
                  OutputIt                   result,
                  CompareOp                  compare)
 {
-  OutputIt ret = result;
-  if (__THRUST_HAS_CUDART__)
-  {
-    typename thrust::iterator_value<ItemsIt1>::type *null_ = NULL;
-    //
-    ret = __set_operations::set_operations<thrust::detail::false_type>(
-              policy,
-              items1_first,
-              items1_last,
-              items2_first,
-              items2_last,
-              null_,
-              null_,
-              result,
-              null_,
-              compare,
-              __set_operations::serial_set_intersection())
-              .first;
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::set_intersection(cvt_to_seq(derived_cast(policy)),
-                                   items1_first,
-                                   items1_last,
-                                   items2_first,
-                                   items2_last,
-                                   result,
-                                   compare);
-#endif
-  }
-  return ret;
+  THRUST_CDP_DISPATCH(
+    (using items1_t  = thrust::iterator_value_t<ItemsIt1>;
+     items1_t *null_ = NULL;
+     auto tmp = __set_operations::set_operations<thrust::detail::false_type>(
+       policy,
+       items1_first,
+       items1_last,
+       items2_first,
+       items2_last,
+       null_,
+       null_,
+       result,
+       null_,
+       compare,
+       __set_operations::serial_set_intersection());
+     result = tmp.first;),
+    (result = thrust::set_intersection(cvt_to_seq(derived_cast(policy)),
+                                       items1_first,
+                                       items1_last,
+                                       items2_first,
+                                       items2_last,
+                                       result,
+                                       compare);));
+  return result;
 }
 
 template <class Derived,
@@ -1505,41 +1493,32 @@ set_symmetric_difference(execution_policy<Derived> &policy,
                          OutputIt                   result,
                          CompareOp                  compare)
 {
-  OutputIt ret = result;
-  if (__THRUST_HAS_CUDART__)
-  {
-    typename thrust::iterator_value<ItemsIt1>::type *null_ = NULL;
-    //
-    ret = __set_operations::set_operations<thrust::detail::false_type>(
-              policy,
-              items1_first,
-              items1_last,
-              items2_first,
-              items2_last,
-              null_,
-              null_,
-              result,
-              null_,
-              compare,
-              __set_operations::serial_set_symmetric_difference())
-              .first;
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::set_symmetric_difference(cvt_to_seq(derived_cast(policy)),
-                                           items1_first,
-                                           items1_last,
-                                           items2_first,
-                                           items2_last,
-                                           result,
-                                           compare);
-#endif
-  }
-  return ret;
+  THRUST_CDP_DISPATCH(
+    (using items1_t  = thrust::iterator_value_t<ItemsIt1>;
+     items1_t *null_ = nullptr;
+     auto tmp = __set_operations::set_operations<thrust::detail::false_type>(
+       policy,
+       items1_first,
+       items1_last,
+       items2_first,
+       items2_last,
+       null_,
+       null_,
+       result,
+       null_,
+       compare,
+       __set_operations::serial_set_symmetric_difference());
+     result = tmp.first;),
+    (result = thrust::set_symmetric_difference(cvt_to_seq(derived_cast(policy)),
+                                               items1_first,
+                                               items1_last,
+                                               items2_first,
+                                               items2_last,
+                                               result,
+                                               compare);));
+  return result;
 }
 
-
 template <class Derived,
           class ItemsIt1,
           class ItemsIt2,
@@ -1579,41 +1558,32 @@ set_union(execution_policy<Derived> &policy,
           OutputIt                   result,
           CompareOp                  compare)
 {
-  OutputIt ret = result;
-  if (__THRUST_HAS_CUDART__)
-  {
-    typename thrust::iterator_value<ItemsIt1>::type *null_ = NULL;
-    //
-    ret = __set_operations::set_operations<thrust::detail::false_type>(
-              policy,
-              items1_first,
-              items1_last,
-              items2_first,
-              items2_last,
-              null_,
-              null_,
-              result,
-              null_,
-              compare,
-              __set_operations::serial_set_union())
-              .first;
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::set_union(cvt_to_seq(derived_cast(policy)),
-                            items1_first,
-                            items1_last,
-                            items2_first,
-                            items2_last,
-                            result,
-                            compare);
-#endif
-  }
-  return ret;
+  THRUST_CDP_DISPATCH(
+    (using items1_t  = thrust::iterator_value_t<ItemsIt1>;
+     items1_t *null_ = nullptr;
+     auto tmp = __set_operations::set_operations<thrust::detail::false_type>(
+       policy,
+       items1_first,
+       items1_last,
+       items2_first,
+       items2_last,
+       null_,
+       null_,
+       result,
+       null_,
+       compare,
+       __set_operations::serial_set_union());
+     result = tmp.first;),
+    (result = thrust::set_union(cvt_to_seq(derived_cast(policy)),
+                                items1_first,
+                                items1_last,
+                                items2_first,
+                                items2_last,
+                                result,
+                                compare);));
+  return result;
 }
 
-
 template <class Derived,
           class ItemsIt1,
           class ItemsIt2,
@@ -1666,37 +1636,30 @@ set_difference_by_key(execution_policy<Derived> &policy,
                       ItemsOutputIt              items_result,
                       CompareOp                  compare_op)
 {
-  pair<KeysOutputIt, ItemsOutputIt> ret = thrust::make_pair(keys_result, items_result);
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __set_operations::set_operations<thrust::detail::true_type>(
-        policy,
-        keys1_first,
-        keys1_last,
-        keys2_first,
-        keys2_last,
-        items1_first,
-        items2_first,
-        keys_result,
-        items_result,
-        compare_op,
-        __set_operations::serial_set_difference());
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::set_difference_by_key(cvt_to_seq(derived_cast(policy)),
-                                        keys1_first,
-                                        keys1_last,
-                                        keys2_first,
-                                        keys2_last,
-                                        items1_first,
-                                        items2_first,
-                                        keys_result,
-                                        items_result,
-                                        compare_op);
-#endif
-  }
+  auto ret = thrust::make_pair(keys_result, items_result);
+  THRUST_CDP_DISPATCH(
+    (ret = __set_operations::set_operations<thrust::detail::true_type>(
+       policy,
+       keys1_first,
+       keys1_last,
+       keys2_first,
+       keys2_last,
+       items1_first,
+       items2_first,
+       keys_result,
+       items_result,
+       compare_op,
+       __set_operations::serial_set_difference());),
+    (ret = thrust::set_difference_by_key(cvt_to_seq(derived_cast(policy)),
+                                         keys1_first,
+                                         keys1_last,
+                                         keys2_first,
+                                         keys2_last,
+                                         items1_first,
+                                         items2_first,
+                                         keys_result,
+                                         items_result,
+                                         compare_op);));
   return ret;
 }
 
@@ -1753,36 +1716,29 @@ set_intersection_by_key(execution_policy<Derived> &policy,
                         ItemsOutputIt              items_result,
                         CompareOp                  compare_op)
 {
-  pair<KeysOutputIt, ItemsOutputIt> ret = thrust::make_pair(keys_result, items_result);
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __set_operations::set_operations<thrust::detail::true_type>(
-        policy,
-        keys1_first,
-        keys1_last,
-        keys2_first,
-        keys2_last,
-        items1_first,
-        items1_first,
-        keys_result,
-        items_result,
-        compare_op,
-        __set_operations::serial_set_intersection());
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::set_intersection_by_key(cvt_to_seq(derived_cast(policy)),
-                                          keys1_first,
-                                          keys1_last,
-                                          keys2_first,
-                                          keys2_last,
-                                          items1_first,
-                                          keys_result,
-                                          items_result,
-                                          compare_op);
-#endif
-  }
+  auto ret = thrust::make_pair(keys_result, items_result);
+  THRUST_CDP_DISPATCH(
+    (ret = __set_operations::set_operations<thrust::detail::true_type>(
+       policy,
+       keys1_first,
+       keys1_last,
+       keys2_first,
+       keys2_last,
+       items1_first,
+       items1_first,
+       keys_result,
+       items_result,
+       compare_op,
+       __set_operations::serial_set_intersection());),
+    (ret = thrust::set_intersection_by_key(cvt_to_seq(derived_cast(policy)),
+                                           keys1_first,
+                                           keys1_last,
+                                           keys2_first,
+                                           keys2_last,
+                                           items1_first,
+                                           keys_result,
+                                           items_result,
+                                           compare_op);));
   return ret;
 }
 
@@ -1838,37 +1794,31 @@ set_symmetric_difference_by_key(execution_policy<Derived> &policy,
                                 ItemsOutputIt              items_result,
                                 CompareOp                  compare_op)
 {
-  pair<KeysOutputIt, ItemsOutputIt> ret = thrust::make_pair(keys_result, items_result);
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __set_operations::set_operations<thrust::detail::true_type>(
-        policy,
-        keys1_first,
-        keys1_last,
-        keys2_first,
-        keys2_last,
-        items1_first,
-        items2_first,
-        keys_result,
-        items_result,
-        compare_op,
-        __set_operations::serial_set_symmetric_difference());
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::set_symmetric_difference_by_key(cvt_to_seq(derived_cast(policy)),
-                                                  keys1_first,
-                                                  keys1_last,
-                                                  keys2_first,
-                                                  keys2_last,
-                                                  items1_first,
-                                                  items2_first,
-                                                  keys_result,
-                                                  items_result,
-                                                  compare_op);
-#endif
-  }
+  auto ret = thrust::make_pair(keys_result, items_result);
+  THRUST_CDP_DISPATCH(
+    (ret = __set_operations::set_operations<thrust::detail::true_type>(
+       policy,
+       keys1_first,
+       keys1_last,
+       keys2_first,
+       keys2_last,
+       items1_first,
+       items2_first,
+       keys_result,
+       items_result,
+       compare_op,
+       __set_operations::serial_set_symmetric_difference());),
+    (ret =
+       thrust::set_symmetric_difference_by_key(cvt_to_seq(derived_cast(policy)),
+                                               keys1_first,
+                                               keys1_last,
+                                               keys2_first,
+                                               keys2_last,
+                                               items1_first,
+                                               items2_first,
+                                               keys_result,
+                                               items_result,
+                                               compare_op);));
   return ret;
 }
 
@@ -1926,37 +1876,30 @@ set_union_by_key(execution_policy<Derived> &policy,
                  ItemsOutputIt              items_result,
                  CompareOp                  compare_op)
 {
-  pair<KeysOutputIt, ItemsOutputIt> ret = thrust::make_pair(keys_result, items_result);
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __set_operations::set_operations<thrust::detail::true_type>(
-        policy,
-        keys1_first,
-        keys1_last,
-        keys2_first,
-        keys2_last,
-        items1_first,
-        items2_first,
-        keys_result,
-        items_result,
-        compare_op,
-        __set_operations::serial_set_union());
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::set_union_by_key(cvt_to_seq(derived_cast(policy)),
-                                   keys1_first,
-                                   keys1_last,
-                                   keys2_first,
-                                   keys2_last,
-                                   items1_first,
-                                   items2_first,
-                                   keys_result,
-                                   items_result,
-                                   compare_op);
-#endif
-  }
+  auto ret = thrust::make_pair(keys_result, items_result);
+  THRUST_CDP_DISPATCH(
+    (ret = __set_operations::set_operations<thrust::detail::true_type>(
+       policy,
+       keys1_first,
+       keys1_last,
+       keys2_first,
+       keys2_last,
+       items1_first,
+       items2_first,
+       keys_result,
+       items_result,
+       compare_op,
+       __set_operations::serial_set_union());),
+    (ret = thrust::set_union_by_key(cvt_to_seq(derived_cast(policy)),
+                                    keys1_first,
+                                    keys1_last,
+                                    keys2_first,
+                                    keys2_last,
+                                    items1_first,
+                                    items2_first,
+                                    keys_result,
+                                    items_result,
+                                    compare_op);));
   return ret;
 }
 
@@ -1992,5 +1935,5 @@ set_union_by_key(execution_policy<Derived> &policy,
 }
 
 }    // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/sort.h b/thrust/system/cuda/detail/sort.h
index 3f351f966..db4c211b3 100644
--- a/thrust/system/cuda/detail/sort.h
+++ b/thrust/system/cuda/detail/sort.h
@@ -26,1243 +26,132 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-#include <thrust/detail/cstdint.h>
-#include <thrust/detail/temporary_array.h>
-#include <thrust/system/cuda/detail/util.h>
+#include <thrust/distance.h>
+#include <thrust/extrema.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+
 #include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
 #include <thrust/system/cuda/detail/core/agent_launcher.h>
 #include <thrust/system/cuda/detail/core/util.h>
-#include <thrust/system/cuda/detail/cub/device/device_radix_sort.cuh>
-
 #include <thrust/system/cuda/detail/execution_policy.h>
 #include <thrust/system/cuda/detail/par_to_seq.h>
-#include <thrust/detail/trivial_sequence.h>
-#include <thrust/detail/integer_math.h>
-#include <thrust/extrema.h>
-#include <thrust/sort.h>
-#include <thrust/distance.h>
-#include <thrust/sequence.h>
+#include <thrust/system/cuda/detail/util.h>
+
 #include <thrust/detail/alignment.h>
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/integer_math.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/detail/trivial_sequence.h>
+
 #include <thrust/type_traits/is_contiguous_iterator.h>
 
-THRUST_BEGIN_NS
+#include <cub/device/device_radix_sort.cuh>
+#include <cub/device/device_merge_sort.cuh>
+
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 namespace __merge_sort {
 
-  template <class KeysIt1,
-            class KeysIt2,
-            class Size,
-            class BinaryPred>
-  THRUST_DEVICE_FUNCTION Size 
-  merge_path(KeysIt1    keys1,
-             KeysIt2    keys2,
-             Size       keys1_count,
-             Size       keys2_count,
-             Size       diag,
-             BinaryPred binary_pred)
-  {
-    typedef typename iterator_traits<KeysIt1>::value_type key1_type;
-    typedef typename iterator_traits<KeysIt2>::value_type key2_type;
-
-    Size keys1_begin = thrust::max<Size>(0, diag - keys2_count);
-    Size keys1_end   = thrust::min<Size>(diag, keys1_count);
-
-    while (keys1_begin < keys1_end)
-    {
-      Size      mid  = (keys1_begin + keys1_end) >> 1;
-      key1_type key1 = keys1[mid];
-      key2_type key2 = keys2[diag - 1 - mid];
-      bool      pred = binary_pred(key2, key1);
-      if (pred)
-      {
-        keys1_end = mid;
-      }
-      else
-      {
-        keys1_begin = mid + 1;
-      }
-    }
-    return keys1_begin;
-  }
-
-  template <class It, class T2, class CompareOp, int ITEMS_PER_THREAD>
-  THRUST_DEVICE_FUNCTION void 
-  serial_merge(It  keys_shared,
-               int keys1_beg,
-               int keys2_beg,
-               int keys1_count,
-               int keys2_count,
-               T2 (&output)[ITEMS_PER_THREAD],
-               int (&indices)[ITEMS_PER_THREAD],
-               CompareOp compare_op)
-  {
-    int keys1_end = keys1_beg + keys1_count;
-    int keys2_end = keys2_beg + keys2_count;
-    
-    typedef typename iterator_value<It>::type key_type;
-
-    key_type key1 = keys_shared[keys1_beg];
-    key_type key2 = keys_shared[keys2_beg];
-
-
-#pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-    {
-      bool p = (keys2_beg < keys2_end) &&
-               ((keys1_beg >= keys1_end) ||
-                compare_op(key2,key1));
-
-      output[ITEM]  = p ? key2 : key1;
-      indices[ITEM] = p ? keys2_beg++ : keys1_beg++;
-
-      if (p)
-      {
-        key2 = keys_shared[keys2_beg];
-      }
-      else
-      {
-        key1 = keys_shared[keys1_beg];
-      }
-    }
-  }
-
-  template <int                      _BLOCK_THREADS,
-            int                      _ITEMS_PER_THREAD = 1,
-            cub::BlockLoadAlgorithm  _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
-            cub::CacheLoadModifier   _LOAD_MODIFIER    = cub::LOAD_LDG,
-            cub::BlockStoreAlgorithm _STORE_ALGORITHM  = cub::BLOCK_STORE_DIRECT,
-            int                      _MIN_BLOCKS       = 1>
-  struct PtxPolicy
-  {
-    enum
-    {
-      BLOCK_THREADS      = _BLOCK_THREADS,
-      ITEMS_PER_THREAD   = _ITEMS_PER_THREAD,
-      MIN_BLOCKS         = _MIN_BLOCKS,
-      ITEMS_PER_TILE     = _BLOCK_THREADS * _ITEMS_PER_THREAD,
-    };
-
-    static const cub::BlockLoadAlgorithm  LOAD_ALGORITHM  = _LOAD_ALGORITHM;
-    static const cub::CacheLoadModifier   LOAD_MODIFIER   = _LOAD_MODIFIER;
-    static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
-  }; // PtxPolicy
-
-
-  template<class Arch, class T>
-  struct Tuning;
-
-  template<class T>
-  struct Tuning<sm35,T>
-  {
-    const static int INPUT_SIZE = sizeof(T);
-
-    enum
-    {
-      NOMINAL_4B_ITEMS_PER_THREAD = 11,
-      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-    };
-
-    typedef PtxPolicy<256,
-                      ITEMS_PER_THREAD,
-                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
-                      cub::LOAD_LDG,
-                      cub::BLOCK_STORE_WARP_TRANSPOSE>
-        type;
-  };
-
-  template<class T>
-  struct Tuning<sm52,T>
-  {
-    const static int INPUT_SIZE = sizeof(T);
-
-    enum
-    {
-      NOMINAL_4B_ITEMS_PER_THREAD = 15,
-      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-    };
-
-    typedef PtxPolicy<512,
-                      ITEMS_PER_THREAD,
-                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
-                      cub::LOAD_LDG,
-                      cub::BLOCK_STORE_WARP_TRANSPOSE>
-        type;
-  };
-
-  template<class T>
-  struct Tuning<sm60,T>
-  {
-    const static int INPUT_SIZE = sizeof(T);
-
-    enum
-    {
-      NOMINAL_4B_ITEMS_PER_THREAD = 17,
-      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-    };
-
-    typedef PtxPolicy<256,
-                      ITEMS_PER_THREAD,
-                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
-                      cub::LOAD_DEFAULT,
-                      cub::BLOCK_STORE_WARP_TRANSPOSE>
-        type;
-  };
-
-  template<class T>  
-  struct Tuning<sm30,T>
-  {
-    enum
-    {
-      NOMINAL_4B_ITEMS_PER_THREAD = 7,
-      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-    };
-
-    typedef PtxPolicy<128,
-                      ITEMS_PER_THREAD,
-                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
-                      cub::LOAD_DEFAULT,
-                      cub::BLOCK_STORE_WARP_TRANSPOSE>
-        type;
-  };
-  
   template <class KeysIt,
             class ItemsIt,
-            class Size,
-            class CompareOp,
-            class SORT_ITEMS,
-            class STABLE>
-  struct BlockSortAgent
-  {
-    typedef typename iterator_traits<KeysIt>::value_type key_type;
-    typedef typename iterator_traits<ItemsIt>::value_type item_type;
-
-    template <class Arch>
-    struct PtxPlan : Tuning<Arch, key_type>::type
-    {
-      typedef Tuning<Arch,key_type> tuning;
-
-      typedef typename core::LoadIterator<PtxPlan, KeysIt>::type  KeysLoadIt;
-      typedef typename core::LoadIterator<PtxPlan, ItemsIt>::type ItemsLoadIt;
-
-      typedef typename core::BlockLoad<PtxPlan, KeysLoadIt>::type  BlockLoadKeys;
-      typedef typename core::BlockLoad<PtxPlan, ItemsLoadIt>::type BlockLoadItems;
-
-      typedef typename core::BlockStore<PtxPlan, KeysIt>::type     BlockStoreKeysIt;
-      typedef typename core::BlockStore<PtxPlan, ItemsIt>::type    BlockStoreItemsIt;
-      typedef typename core::BlockStore<PtxPlan, key_type*>::type  BlockStoreKeysRaw;
-      typedef typename core::BlockStore<PtxPlan, item_type*>::type BlockStoreItemsRaw;
-
-      union TempStorage
-      {
-        typename BlockLoadKeys::TempStorage   load_keys;
-        typename BlockLoadItems::TempStorage  load_items;
-        typename BlockStoreKeysIt::TempStorage  store_keys_it;
-        typename BlockStoreItemsIt::TempStorage store_items_it;
-        typename BlockStoreKeysRaw::TempStorage  store_keys_raw;
-        typename BlockStoreItemsRaw::TempStorage store_items_raw;
-
-        core::uninitialized_array<key_type, PtxPlan::ITEMS_PER_TILE + 1>  keys_shared;
-        core::uninitialized_array<item_type, PtxPlan::ITEMS_PER_TILE + 1> items_shared;
-      };    // union TempStorage
-    };      // struct PtxPlan
-
-    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
-
-    typedef typename ptx_plan::KeysLoadIt         KeysLoadIt;
-    typedef typename ptx_plan::ItemsLoadIt        ItemsLoadIt;
-    typedef typename ptx_plan::BlockLoadKeys      BlockLoadKeys;
-    typedef typename ptx_plan::BlockLoadItems     BlockLoadItems;
-    typedef typename ptx_plan::BlockStoreKeysIt   BlockStoreKeysIt;
-    typedef typename ptx_plan::BlockStoreItemsIt  BlockStoreItemsIt;
-    typedef typename ptx_plan::BlockStoreKeysRaw  BlockStoreKeysRaw;
-    typedef typename ptx_plan::BlockStoreItemsRaw BlockStoreItemsRaw;
-    typedef typename ptx_plan::TempStorage        TempStorage;
-
-    enum
-    {
-      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
-      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
-      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE
-    };
-
-    struct impl
-    {
-      //---------------------------------------------------------------------
-      // Per thread data
-      //---------------------------------------------------------------------
-
-      bool         ping;
-      TempStorage& storage;
-      KeysLoadIt   keys_in;
-      ItemsLoadIt  items_in;
-      Size         keys_count;
-      KeysIt       keys_out_it;
-      ItemsIt      items_out_it;
-      key_type*    keys_out_raw;
-      item_type*   items_out_raw;
-      CompareOp    compare_op;
-
-      //---------------------------------------------------------------------
-      // Serial stable sort network 
-      //---------------------------------------------------------------------
-
-      THRUST_DEVICE_FUNCTION
-      void stable_odd_even_sort(key_type (&keys)[ITEMS_PER_THREAD],
-                                item_type (&items)[ITEMS_PER_THREAD])
-      {
-#pragma unroll
-        for (int I = 0; I < ITEMS_PER_THREAD; ++I)
-        {
-#pragma unroll
-          for (int J = 1 & I; J < ITEMS_PER_THREAD - 1; J += 2)
-          {
-            if (compare_op(keys[J + 1], keys[J]))
-            {
-              using thrust::swap;
-              swap(keys[J], keys[J + 1]);
-              if (SORT_ITEMS::value)
-              {
-                swap(items[J], items[J + 1]);
-              }
-            }
-          }    // inner loop
-        }      // outer loop
-      }
-
-      //---------------------------------------------------------------------
-      // Parallel thread block merge sort
-      //---------------------------------------------------------------------
-
-      THRUST_DEVICE_FUNCTION void
-      block_mergesort(int tid,
-                      int count,
-                      key_type (&keys_loc)[ITEMS_PER_THREAD],
-                      item_type (&items_loc)[ITEMS_PER_THREAD])
-      {
-        using core::uninitialized_array;
-        using core::sync_threadblock;
-
-        // stable sort items in a single thread
-        //
-        stable_odd_even_sort(keys_loc,items_loc);
-
-        // each thread has  sorted keys_loc
-        // merge sort keys_loc in shared memory
-        //
-#pragma unroll
-        for (int coop = 2; coop <= BLOCK_THREADS; coop *= 2)
-        {
-          sync_threadblock();
-
-          // store keys in shmem
-          //
-#pragma unroll
-          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-          {
-            int idx                  = ITEMS_PER_THREAD * threadIdx.x + ITEM;
-            storage.keys_shared[idx] = keys_loc[ITEM];
-          }
-
-          sync_threadblock();
-
-          int  indices[ITEMS_PER_THREAD];
-
-          int list  = ~(coop - 1) & tid;
-          int start = ITEMS_PER_THREAD * list;
-          int size  = ITEMS_PER_THREAD * (coop >> 1);
-
-          int diag = min(count,
-                         ITEMS_PER_THREAD * ((coop - 1) & tid));
-
-          int keys1_beg = min(count, start);
-          int keys1_end = min(count, keys1_beg + size);
-          int keys2_beg = keys1_end;
-          int keys2_end = min(count, keys2_beg + size);
-
-          int keys1_count = keys1_end - keys1_beg;
-          int keys2_count = keys2_end - keys2_beg;
-
-          int partition_diag = merge_path(&storage.keys_shared[keys1_beg],
-                                          &storage.keys_shared[keys2_beg],
-                                          keys1_count,
-                                          keys2_count,
-                                          diag,
-                                          compare_op);
-
-          int keys1_beg_loc   = keys1_beg + partition_diag;
-          int keys1_end_loc   = keys1_end;
-          int keys2_beg_loc   = keys2_beg + diag - partition_diag;
-          int keys2_end_loc   = keys2_end;
-          int keys1_count_loc = keys1_end_loc - keys1_beg_loc;
-          int keys2_count_loc = keys2_end_loc - keys2_beg_loc;
-          serial_merge(&storage.keys_shared[0],
-                       keys1_beg_loc,
-                       keys2_beg_loc,
-                       keys1_count_loc,
-                       keys2_count_loc,
-                       keys_loc,
-                       indices,
-                       compare_op);
-
-
-          if (SORT_ITEMS::value)
-          {
-            sync_threadblock();
-
-            // store keys in shmem
-            //
-#pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-            {
-              int idx                   = ITEMS_PER_THREAD * threadIdx.x + ITEM;
-              storage.items_shared[idx] = items_loc[ITEM];
-            }
-
-            sync_threadblock();
-
-            // gather items from shmem
-            //
-#pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-            {
-              items_loc[ITEM] = storage.items_shared[indices[ITEM]];
-            }
-          }
-        }
-      }    // func block_merge_sort
-      
-      //---------------------------------------------------------------------
-      // Tile processing 
-      //---------------------------------------------------------------------
-
-      template <bool IS_LAST_TILE>
-      THRUST_DEVICE_FUNCTION void
-      consume_tile(int  tid,
-                   Size /*tile_idx*/,
-                   Size tile_base,
-                   int  num_remaining)
-      {
-        using core::uninitialized_array;
-        using core::sync_threadblock;
-
-        item_type items_loc[ITEMS_PER_THREAD];
-        if (SORT_ITEMS::value)
-        {
-          BlockLoadItems(storage.load_items)
-              .Load(items_in + tile_base,
-                    items_loc,
-                    num_remaining,
-                    *(items_in + tile_base));
-
-          sync_threadblock();
-        }
-
-        key_type keys_loc[ITEMS_PER_THREAD];
-        if (IS_LAST_TILE)
-        {
-          BlockLoadKeys(storage.load_keys)
-              .Load(keys_in + tile_base,
-                    keys_loc,
-                    num_remaining,
-                    *(keys_in + tile_base));
-        }
-        else
-        {
-          BlockLoadKeys(storage.load_keys)
-              .Load(keys_in + tile_base, keys_loc);
-        }
-
-        if (IS_LAST_TILE)
-        {
-          // if last tile, find valid max_key
-          // and fill the remainig keys with it
-          //
-          key_type max_key = keys_loc[0];
-#pragma unroll
-          for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
-          {
-            if (ITEMS_PER_THREAD * tid + ITEM < num_remaining)
-            {
-              max_key = compare_op(max_key, keys_loc[ITEM])
-                            ? keys_loc[ITEM]
-                            : max_key;
-            }
-            else
-            {
-              keys_loc[ITEM] = max_key;
-            }
-          }
-        }
-
-        sync_threadblock();
-
-        if (IS_LAST_TILE)
-        {
-          block_mergesort(tid,
-                          num_remaining,
-                          keys_loc,
-                          items_loc);
-        }
-        else
-        {
-          block_mergesort(tid,
-                          ITEMS_PER_TILE,
-                          keys_loc,
-                          items_loc);
-        }
-
-        sync_threadblock();
-
-        if (ping)
-        {
-          if (IS_LAST_TILE)
-          {
-            BlockStoreKeysIt(storage.store_keys_it)
-                .Store(keys_out_it + tile_base, keys_loc, num_remaining);
-          }
-          else
-          {
-            BlockStoreKeysIt(storage.store_keys_it)
-                .Store(keys_out_it + tile_base, keys_loc);
-          }
-
-          if (SORT_ITEMS::value)
-          {
-            sync_threadblock();
-
-            BlockStoreItemsIt(storage.store_items_it)
-                .Store(items_out_it + tile_base, items_loc, num_remaining);
-          }
-        }
-        else
-        {
-          if (IS_LAST_TILE)
-          {
-            BlockStoreKeysRaw(storage.store_keys_raw)
-                .Store(keys_out_raw + tile_base, keys_loc, num_remaining);
-          }
-          else
-          {
-            BlockStoreKeysRaw(storage.store_keys_raw)
-                .Store(keys_out_raw + tile_base, keys_loc);
-          }
-
-          if (SORT_ITEMS::value)
-          {
-            sync_threadblock();
-
-            BlockStoreItemsRaw(storage.store_items_raw)
-                .Store(items_out_raw + tile_base, items_loc, num_remaining);
-          }
-        }
-      }
-
-      //---------------------------------------------------------------------
-      // Constructor 
-      //---------------------------------------------------------------------
-
-      THRUST_DEVICE_FUNCTION
-      impl(bool         ping_,
-           TempStorage& storage_,
-           KeysLoadIt   keys_in_,
-           ItemsLoadIt  items_in_,
-           Size         keys_count_,
-           KeysIt       keys_out_it_,
-           ItemsIt      items_out_it_,
-           key_type*    keys_out_raw_,
-           item_type*   items_out_raw_,
-           CompareOp    compare_op_)
-          : ping(ping_),
-            storage(storage_),
-            keys_in(keys_in_),
-            items_in(items_in_),
-            keys_count(keys_count_),
-            keys_out_it(keys_out_it_),
-            items_out_it(items_out_it_),
-            keys_out_raw(keys_out_raw_),
-            items_out_raw(items_out_raw_),
-            compare_op(compare_op_)
-      {
-        int  tid           = threadIdx.x;
-        Size tile_idx      = blockIdx.x;
-        Size num_tiles     = gridDim.x;
-        Size tile_base     = tile_idx * ITEMS_PER_TILE;
-        int  items_in_tile = min<int>(keys_count - tile_base, ITEMS_PER_TILE);
-        if (tile_idx < num_tiles - 1)
-        {
-          consume_tile<false>(tid, tile_idx, tile_base, ITEMS_PER_TILE);
-        }
-        else
-        {
-          consume_tile<true>(tid, tile_idx, tile_base, items_in_tile);
-        }
-      }
-    };    // struct impl
-
-    //---------------------------------------------------------------------
-    // Agent entry point
-    //---------------------------------------------------------------------
-
-    THRUST_AGENT_ENTRY(bool       ping,
-                       KeysIt     keys_inout,
-                       ItemsIt    items_inout,
-                       Size       keys_count,
-                       key_type*  keys_out,
-                       item_type* items_out,
-                       CompareOp  compare_op,
-                       char*      shmem)
-    {
-      TempStorage& storage = *reinterpret_cast<TempStorage*>(shmem);
-
-      impl(ping,
-           storage,
-           core::make_load_iterator(ptx_plan(), keys_inout),
-           core::make_load_iterator(ptx_plan(), items_inout),
-           keys_count,
-           keys_inout,
-           items_inout,
-           keys_out,
-           items_out,
-           compare_op);
-    }
-  };    // struct BlockSortAgent
-
-  template <class KeysIt,
             class Size,
             class CompareOp>
-  struct PartitionAgent
+  THRUST_RUNTIME_FUNCTION cudaError_t
+  doit_step(void*        d_temp_storage,
+            size_t&      temp_storage_bytes,
+            KeysIt       keys,
+            ItemsIt      ,
+            Size         keys_count,
+            CompareOp    compare_op,
+            cudaStream_t stream,
+            thrust::detail::integral_constant<bool, false> /* sort_keys */)
   {
-    typedef typename iterator_traits<KeysIt>::value_type key_type;
-    template<class Arch>
-    struct PtxPlan : PtxPolicy<256> {};
-
-    typedef core::specialize_plan<PtxPlan> ptx_plan;
-    
-    //---------------------------------------------------------------------
-    // Agent entry point
-    //---------------------------------------------------------------------
-
-    THRUST_AGENT_ENTRY(bool      ping,
-                       KeysIt    keys_ping,
-                       key_type* keys_pong,
-                       Size      keys_count,
-                       Size      num_partitions,
-                       Size*     merge_partitions,
-                       CompareOp compare_op,
-                       Size      coop,
-                       int       items_per_tile,
-                       char*     /*shmem*/)
-    {
-      Size partition_idx = blockDim.x * blockIdx.x + threadIdx.x;
-      if (partition_idx < num_partitions)
-      {
-        Size list  = ~(coop - 1) & partition_idx;
-        Size start = items_per_tile * list;
-        Size size  = items_per_tile * (coop >> 1);
-
-        Size keys1_beg = min(keys_count, start);
-        Size keys1_end = min(keys_count, start + size);
-        Size keys2_beg = keys1_end;
-        Size keys2_end = min(keys_count, keys2_beg + size);
-
-
-        Size partition_at = min(keys2_end - keys1_beg,
-                                items_per_tile * ((coop - 1) & partition_idx));
-
-        Size partition_diag = ping ? merge_path(keys_ping + keys1_beg,
-                                                keys_ping + keys2_beg,
-                                                keys1_end - keys1_beg,
-                                                keys2_end - keys2_beg,
-                                                partition_at,
-                                                compare_op)
-                                   : merge_path(keys_pong + keys1_beg,
-                                                keys_pong + keys2_beg,
-                                                keys1_end - keys1_beg,
-                                                keys2_end - keys2_beg,
-                                                partition_at,
-                                                compare_op);
-
-
-        merge_partitions[partition_idx] = keys1_beg + partition_diag;
-      }
-    }
-  };    // struct PartitionAgent
+    using ItemsInputIt = cub::NullType *;
+    ItemsInputIt items = nullptr;
+
+    using DispatchMergeSortT = cub::DispatchMergeSort<KeysIt,
+                                                      ItemsInputIt,
+                                                      KeysIt,
+                                                      ItemsInputIt,
+                                                      Size,
+                                                      CompareOp>;
+
+
+    return DispatchMergeSortT::Dispatch(d_temp_storage,
+                                        temp_storage_bytes,
+                                        keys,
+                                        items,
+                                        keys,
+                                        items,
+                                        keys_count,
+                                        compare_op,
+                                        stream);
+  }
 
   template <class KeysIt,
             class ItemsIt,
             class Size,
-            class CompareOp,
-            class MERGE_ITEMS>
-  struct MergeAgent
+            class CompareOp>
+  THRUST_RUNTIME_FUNCTION cudaError_t
+  doit_step(void *d_temp_storage,
+            size_t &temp_storage_bytes,
+            KeysIt keys,
+            ItemsIt items,
+            Size keys_count,
+            CompareOp compare_op,
+            cudaStream_t stream,
+            thrust::detail::integral_constant<bool, true> /* sort_items */)
   {
-    typedef typename iterator_traits<KeysIt>::value_type  key_type;
-    typedef typename iterator_traits<ItemsIt>::value_type item_type;
-
-    typedef KeysIt     KeysOutputPongIt;
-    typedef ItemsIt    ItemsOutputPongIt;
-    typedef key_type*  KeysOutputPingIt;
-    typedef item_type* ItemsOutputPingIt;
-
-    template<class Arch>
-    struct PtxPlan : Tuning<Arch,key_type>::type
-    {
-      typedef Tuning<Arch,key_type> tuning;
-
-      typedef typename core::LoadIterator<PtxPlan, KeysIt>::type     KeysLoadPingIt;
-      typedef typename core::LoadIterator<PtxPlan, ItemsIt>::type    ItemsLoadPingIt;
-      typedef typename core::LoadIterator<PtxPlan, key_type*>::type  KeysLoadPongIt;
-      typedef typename core::LoadIterator<PtxPlan, item_type*>::type ItemsLoadPongIt;
-
-      typedef typename core::BlockLoad<PtxPlan, KeysLoadPingIt>::type  BlockLoadKeysPing;
-      typedef typename core::BlockLoad<PtxPlan, ItemsLoadPingIt>::type BlockLoadItemsPing;
-      typedef typename core::BlockLoad<PtxPlan, KeysLoadPongIt>::type  BlockLoadKeysPong;
-      typedef typename core::BlockLoad<PtxPlan, ItemsLoadPongIt>::type BlockLoadItemsPong;
-
-      typedef typename core::BlockStore<PtxPlan, KeysOutputPongIt>::type  BlockStoreKeysPong;
-      typedef typename core::BlockStore<PtxPlan, ItemsOutputPongIt>::type BlockStoreItemsPong;
-      typedef typename core::BlockStore<PtxPlan, KeysOutputPingIt>::type  BlockStoreKeysPing;
-      typedef typename core::BlockStore<PtxPlan, ItemsOutputPingIt>::type BlockStoreItemsPing;
-
-      // gather required temporary storage in a union
-      //
-      union TempStorage
-      {
-        typename BlockLoadKeysPing::TempStorage  load_keys_ping;
-        typename BlockLoadItemsPing::TempStorage load_items_ping;
-        typename BlockLoadKeysPong::TempStorage  load_keys_pong;
-        typename BlockLoadItemsPong::TempStorage load_items_pong;
-
-        typename BlockStoreKeysPing::TempStorage  store_keys_ping;
-        typename BlockStoreItemsPing::TempStorage store_items_ping;
-        typename BlockStoreKeysPong::TempStorage  store_keys_pong;
-        typename BlockStoreItemsPong::TempStorage store_items_pong;
-
-        core::uninitialized_array<key_type, PtxPlan::ITEMS_PER_TILE + 1>  keys_shared;
-        core::uninitialized_array<item_type, PtxPlan::ITEMS_PER_TILE + 1> items_shared;
-      };    // union TempStorage
-    };    // struct PtxPlan
-
-    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
-
-    typedef typename ptx_plan::KeysLoadPingIt  KeysLoadPingIt;
-    typedef typename ptx_plan::ItemsLoadPingIt ItemsLoadPingIt;
-    typedef typename ptx_plan::KeysLoadPongIt  KeysLoadPongIt;
-    typedef typename ptx_plan::ItemsLoadPongIt ItemsLoadPongIt;
-
-    typedef typename ptx_plan::BlockLoadKeysPing  BlockLoadKeysPing;
-    typedef typename ptx_plan::BlockLoadItemsPing BlockLoadItemsPing;
-    typedef typename ptx_plan::BlockLoadKeysPong  BlockLoadKeysPong;
-    typedef typename ptx_plan::BlockLoadItemsPong BlockLoadItemsPong;
-
-    typedef typename ptx_plan::BlockStoreKeysPing  BlockStoreKeysPing;
-    typedef typename ptx_plan::BlockStoreItemsPing BlockStoreItemsPing;
-    typedef typename ptx_plan::BlockStoreKeysPong  BlockStoreKeysPong;
-    typedef typename ptx_plan::BlockStoreItemsPong BlockStoreItemsPong;
-
-    typedef typename ptx_plan::TempStorage     TempStorage;
-
-    enum
-    {
-      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
-      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
-      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE
-    };
-
-    struct impl
-    {
-      //---------------------------------------------------------------------
-      // Per thread data
-      //---------------------------------------------------------------------
-
-      bool            ping;
-      TempStorage&    storage;
-
-      KeysLoadPingIt  keys_in_ping;
-      ItemsLoadPingIt items_in_ping;
-      KeysLoadPongIt  keys_in_pong;
-      ItemsLoadPongIt items_in_pong;
-
-      Size            keys_count;
-
-      KeysOutputPongIt  keys_out_pong;
-      ItemsOutputPongIt items_out_pong;
-      KeysOutputPingIt  keys_out_ping;
-      ItemsOutputPingIt items_out_ping;
-
-      CompareOp       compare_op;
-      Size*           merge_partitions;
-      Size            coop;
-
-      //---------------------------------------------------------------------
-      // Utility functions
-      //---------------------------------------------------------------------
-      
-      template <bool IS_FULL_TILE, class T, class It1, class It2>
-      THRUST_DEVICE_FUNCTION void
-      gmem_to_reg(T (&output)[ITEMS_PER_THREAD],
-                  It1 input1,
-                  It2 input2,
-                  int count1,
-                  int count2)
-      {
-        if (IS_FULL_TILE)
-        {
-#pragma unroll
-          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-          {
-            int idx = BLOCK_THREADS * ITEM + threadIdx.x;
-            output[ITEM] = (idx < count1) ? input1[idx] : input2[idx - count1];
-          }
-        }
-        else
-        {
-#pragma unroll
-          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-          {
-            int idx = BLOCK_THREADS * ITEM + threadIdx.x;
-            if (idx < count1 + count2)
-            {
-              output[ITEM] = (idx < count1) ? input1[idx] : input2[idx - count1];
-            }
-          }
-        }
-      }
-
-      template <class T, class It>
-      THRUST_DEVICE_FUNCTION void
-      reg_to_shared(It output,
-                    T (&input)[ITEMS_PER_THREAD])
-      {
-#pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-          int idx = BLOCK_THREADS * ITEM + threadIdx.x;
-          output[idx] = input[ITEM];
-        }
-      }
-
-      //---------------------------------------------------------------------
-      // Tile processing 
-      //---------------------------------------------------------------------
-
-      template <bool IS_FULL_TILE>
-      THRUST_DEVICE_FUNCTION void
-      consume_tile(int  tid,
-                   Size tile_idx,
-                   Size tile_base,
-                   int  count)
-      {
-        using core::sync_threadblock;
-        using core::uninitialized_array;
-
-        Size partition_beg = merge_partitions[tile_idx + 0];
-        Size partition_end = merge_partitions[tile_idx + 1];
-
-        Size list = ~(coop - 1) & tile_idx;
-        Size start = ITEMS_PER_TILE * list;
-        Size size  = ITEMS_PER_TILE * (coop >> 1);
-
-        Size diag   = ITEMS_PER_TILE * tile_idx - start;
-
-        Size keys1_beg = partition_beg;
-        Size keys1_end = partition_end;
-        Size keys2_beg = min<Size>(keys_count, 2 * start + size + diag - partition_beg);
-        Size keys2_end = min<Size>(keys_count, 2 * start + size + diag + ITEMS_PER_TILE - partition_end);
-
-        if (coop - 1 == ((coop - 1) & tile_idx))
-        {
-          keys1_end = min(keys_count, start + size);
-          keys2_end = min(keys_count, start + size * 2);
-        }
-
-        // number of keys per tile
-        //
-        int num_keys1 = static_cast<int>(keys1_end - keys1_beg);
-        int num_keys2 = static_cast<int>(keys2_end - keys2_beg);
-
-        // load keys1 & keys2
-        key_type keys_loc[ITEMS_PER_THREAD];
-        if (ping)
-        {
-          gmem_to_reg<IS_FULL_TILE>(keys_loc,
-                                    keys_in_ping + keys1_beg,
-                                    keys_in_ping + keys2_beg,
-                                    num_keys1,
-                                    num_keys2);
-        }
-        else
-        {
-          gmem_to_reg<IS_FULL_TILE>(keys_loc,
-                                    keys_in_pong + keys1_beg,
-                                    keys_in_pong + keys2_beg,
-                                    num_keys1,
-                                    num_keys2);
-        }
-        reg_to_shared(&storage.keys_shared[0], keys_loc);
-        
-        // preload items into registers already
-        //
-        item_type items_loc[ITEMS_PER_THREAD];
-        if (MERGE_ITEMS::value)
-        {
-          if (ping)
-          {
-            gmem_to_reg<IS_FULL_TILE>(items_loc,
-                                      items_in_ping + keys1_beg,
-                                      items_in_ping + keys2_beg,
-                                      num_keys1,
-                                      num_keys2);
-          }
-          else
-          {
-            gmem_to_reg<IS_FULL_TILE>(items_loc,
-                                      items_in_pong + keys1_beg,
-                                      items_in_pong + keys2_beg,
-                                      num_keys1,
-                                      num_keys2);
-          }
-        }
-
-        sync_threadblock();
-
-        // use binary search in shared memory
-        // to find merge path for each of thread
-        // we can use int type here, because the number of
-        // items in shared memory is limited
-        //
-        int diag0_loc = min<Size>(num_keys1 + num_keys2,
-                                  ITEMS_PER_THREAD * tid);
-
-        int keys1_beg_loc = merge_path(&storage.keys_shared[0],
-                                       &storage.keys_shared[num_keys1],
-                                       num_keys1,
-                                       num_keys2,
-                                       diag0_loc,
-                                       compare_op);
-        int keys1_end_loc = num_keys1;
-        int keys2_beg_loc = diag0_loc - keys1_beg_loc;
-        int keys2_end_loc = num_keys2;
-
-        int num_keys1_loc = keys1_end_loc - keys1_beg_loc;
-        int num_keys2_loc = keys2_end_loc - keys2_beg_loc;
-
-        // perform serial merge
-        //
-        int indices[ITEMS_PER_THREAD];
-
-        serial_merge(&storage.keys_shared[0],
-                     keys1_beg_loc,
-                     keys2_beg_loc + num_keys1,
-                     num_keys1_loc,
-                     num_keys2_loc,
-                     keys_loc,
-                     indices,
-                     compare_op);
-
-        sync_threadblock();
-
-        // write keys
-        //
-        if (ping)
-        {
-          if (IS_FULL_TILE)
-          {
-            BlockStoreKeysPing(storage.store_keys_ping)
-                .Store(keys_out_ping + tile_base, keys_loc);
-          }
-          else
-          {
-            BlockStoreKeysPing(storage.store_keys_ping)
-                .Store(keys_out_ping + tile_base, keys_loc, num_keys1 + num_keys2);
-          }
-        }
-        else
-        {
-          if (IS_FULL_TILE)
-          {
-            BlockStoreKeysPong(storage.store_keys_pong)
-                .Store(keys_out_pong + tile_base, keys_loc);
-          }
-          else
-          {
-            BlockStoreKeysPong(storage.store_keys_pong)
-                .Store(keys_out_pong + tile_base, keys_loc, num_keys1 + num_keys2);
-          }
-        }
-
-        // if items are provided, merge them
-        if (MERGE_ITEMS::value)
-        {
-          sync_threadblock();
-
-          reg_to_shared(&storage.items_shared[0], items_loc);
-
-          sync_threadblock();
-
-          // gather items from shared mem
-          //
-#pragma unroll
-          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-          {
-            items_loc[ITEM] = storage.items_shared[indices[ITEM]];
-          }
-
-          sync_threadblock();
-
-          // write from reg to gmem
-          //
-          if (ping)
-          {
-            if (IS_FULL_TILE)
-            {
-              BlockStoreItemsPing(storage.store_items_ping)
-                  .Store(items_out_ping + tile_base, items_loc);
-            }
-            else
-            {
-              BlockStoreItemsPing(storage.store_items_ping)
-                  .Store(items_out_ping + tile_base, items_loc, count);
-            }
-          }
-          else
-          {
-            if (IS_FULL_TILE)
-            {
-              BlockStoreItemsPong(storage.store_items_pong)
-                  .Store(items_out_pong + tile_base, items_loc);
-            }
-            else
-            {
-              BlockStoreItemsPong(storage.store_items_pong)
-                  .Store(items_out_pong + tile_base, items_loc, count);
-            }
-          }
-        }
-      }
-
-      //---------------------------------------------------------------------
-      // Constructor 
-      //---------------------------------------------------------------------
-
-      THRUST_DEVICE_FUNCTION
-      impl(bool              ping_,
-           TempStorage&      storage_,
-           KeysLoadPingIt    keys_in_ping_,
-           ItemsLoadPingIt   items_in_ping_,
-           KeysLoadPongIt    keys_in_pong_,
-           ItemsLoadPongIt   items_in_pong_,
-           Size              keys_count_,
-           KeysOutputPingIt  keys_out_ping_,
-           ItemsOutputPingIt items_out_ping_,
-           KeysOutputPongIt  keys_out_pong_,
-           ItemsOutputPongIt items_out_pong_,
-           CompareOp         compare_op_,
-           Size*             merge_partitions_,
-           Size              coop_)
-          : ping(ping_),
-            storage(storage_),
-            keys_in_ping(keys_in_ping_),
-            items_in_ping(items_in_ping_),
-            keys_in_pong(keys_in_pong_),
-            items_in_pong(items_in_pong_),
-            keys_count(keys_count_),
-            keys_out_pong(keys_out_pong_),
-            items_out_pong(items_out_pong_),
-            keys_out_ping(keys_out_ping_),
-            items_out_ping(items_out_ping_),
-            compare_op(compare_op_),
-            merge_partitions(merge_partitions_),
-            coop(coop_)
-      {
-        // XXX with 8.5 chaging type to Size (or long long) results in error!
-        int  tile_idx      = blockIdx.x;
-        Size num_tiles     = gridDim.x;
-        Size tile_base     = Size(tile_idx) * ITEMS_PER_TILE;
-        int tid           = threadIdx.x;
-        int items_in_tile = static_cast<int>(min((Size)ITEMS_PER_TILE,
-                                                 keys_count - tile_base));
-        if (tile_idx < num_tiles - 1)
-        {
-          consume_tile<true>(tid,
-                             tile_idx,
-                             tile_base,
-                             ITEMS_PER_TILE);
-        }
-        else
-        {
-          consume_tile<false>(tid,
-                              tile_idx,
-                              tile_base,
-                              items_in_tile);
-        }
-      }
-    };    // struct impl
-
-    //---------------------------------------------------------------------
-    // Agent entry point
-    //---------------------------------------------------------------------
-
-    THRUST_AGENT_ENTRY(bool       ping,
-                       KeysIt     keys_ping,
-                       ItemsIt    items_ping,
-                       Size       keys_count,
-                       key_type*  keys_pong,
-                       item_type* items_pong,
-                       CompareOp  compare_op,
-                       Size*      merge_partitions,
-                       Size       coop,
-                       char*      shmem)
-    {
-      TempStorage& storage = *reinterpret_cast<TempStorage*>(shmem);
-
-      impl(ping,
-           storage,
-           core::make_load_iterator(ptx_plan(), keys_ping),
-           core::make_load_iterator(ptx_plan(), items_ping),
-           core::make_load_iterator(ptx_plan(), keys_pong),
-           core::make_load_iterator(ptx_plan(), items_pong),
-           keys_count,
-           keys_pong,
-           items_pong,
-           keys_ping,
-           items_ping,
-           compare_op,
-           merge_partitions,
-           coop);
-    }
-  };    // struct MergeAgent;
-
-  /////////////////////////
+    using DispatchMergeSortT =
+      cub::DispatchMergeSort<KeysIt, ItemsIt, KeysIt, ItemsIt, Size, CompareOp>;
+
+    return DispatchMergeSortT::Dispatch(d_temp_storage,
+                                        temp_storage_bytes,
+                                        keys,
+                                        items,
+                                        keys,
+                                        items,
+                                        keys_count,
+                                        compare_op,
+                                        stream);
+  }
 
   template <class SORT_ITEMS,
-            class STABLE,
+            class /* STABLE */,
             class KeysIt,
             class ItemsIt,
             class Size,
             class CompareOp>
   THRUST_RUNTIME_FUNCTION cudaError_t
-  doit_step(void*        d_temp_storage,
-            size_t&      temp_storage_bytes,
-            KeysIt       keys,
-            ItemsIt      items,
-            Size         keys_count,
-            CompareOp    compare_op,
-            cudaStream_t stream,
-            bool         debug_sync)
+  doit_step(void *d_temp_storage,
+            size_t &temp_storage_bytes,
+            KeysIt keys,
+            ItemsIt items,
+            Size keys_count,
+            CompareOp compare_op,
+            cudaStream_t stream)
   {
-    using core::AgentPlan;
-    using core::get_agent_plan;
-
-    typedef typename iterator_traits<KeysIt>::value_type  key_type;
-    typedef typename iterator_traits<ItemsIt>::value_type item_type;
-
-    typedef core::AgentLauncher<
-        BlockSortAgent<KeysIt,
-                       ItemsIt,
-                       Size,
-                       CompareOp,
-                       SORT_ITEMS,
-                       STABLE> >
-        block_sort_agent;
-
-    typedef core::AgentLauncher<PartitionAgent<KeysIt, Size, CompareOp> >
-        partition_agent;
-
-    typedef core::AgentLauncher<
-        MergeAgent<KeysIt,
-                   ItemsIt,
-                   Size,
-                   CompareOp,
-                   SORT_ITEMS> >
-        merge_agent;
-
-    cudaError_t status = cudaSuccess;
-
     if (keys_count == 0)
-      return status;
-
-    typename core::get_plan<partition_agent>::type partition_plan =
-        partition_agent::get_plan();
-
-    typename core::get_plan<merge_agent>::type merge_plan =
-        merge_agent::get_plan(stream);
-
-    AgentPlan block_sort_plan = merge_plan;
-
-    int tile_size = merge_plan.items_per_tile;
-    Size num_tiles = (keys_count + tile_size - 1) / tile_size;
-
-    size_t temp_storage1 = (1 + num_tiles) * sizeof(Size);
-    size_t temp_storage2 = keys_count * sizeof(key_type);
-    size_t temp_storage3 = keys_count * sizeof(item_type) * SORT_ITEMS::value;
-    size_t temp_storage4 = core::vshmem_size(max(block_sort_plan.shared_memory_size,
-                                                 merge_plan.shared_memory_size),
-                                             num_tiles);
-
-    void*  allocations[4]      = {NULL, NULL, NULL, NULL};
-    size_t allocation_sizes[4] = {temp_storage1, temp_storage2, temp_storage3, temp_storage4};
-
-    status = core::alias_storage(d_temp_storage,
-                                 temp_storage_bytes,
-                                 allocations,
-                                 allocation_sizes);
-    CUDA_CUB_RET_IF_FAIL(status);
-
-    if (d_temp_storage == NULL)
-    {
-      return status;
-    };
-
-    int num_passes = thrust::detail::log2_ri(num_tiles);
-    bool ping = !(1 & num_passes);
-
-    Size*      merge_partitions = (Size*)allocations[0];
-    key_type*  keys_buffer      = (key_type*)allocations[1];
-    item_type* items_buffer     = (item_type*)allocations[2];
-
-    char* vshmem_ptr = temp_storage4 > 0 ? (char*)allocations[3] : NULL;
-
-
-    block_sort_agent(block_sort_plan, keys_count, stream, vshmem_ptr, "block_sort_agent", debug_sync)
-        .launch(ping, keys, items, keys_count, keys_buffer, items_buffer, compare_op);
-    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
-
-    size_t num_partitions = num_tiles + 1;
-
-    partition_agent pa(partition_plan, num_partitions, stream, "partition_agent", debug_sync);
-    merge_agent     ma(merge_plan, keys_count, stream, vshmem_ptr, "merge_agent", debug_sync);
-
-    for (int pass = 0; pass < num_passes; ++pass, ping = !ping)
     {
-      Size coop = Size(2) << pass;
-
-      pa.launch(ping,
-                keys,
-                keys_buffer,
-                keys_count,
-                num_partitions,
-                merge_partitions,
-                compare_op,
-                coop,
-                merge_plan.items_per_tile);
-      CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
-
-
-      ma.launch(ping,
-                keys,
-                items,
-                keys_count,
-                keys_buffer,
-                items_buffer,
-                compare_op,
-                merge_partitions,
-                coop);
-      CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+      return cudaSuccess;
     }
 
-    return status;
+    thrust::detail::integral_constant<bool, SORT_ITEMS::value> sort_items{};
+
+    return doit_step(d_temp_storage,
+                     temp_storage_bytes,
+                     keys,
+                     items,
+                     keys_count,
+                     compare_op,
+                     stream,
+                     sort_items);
   }
 
   template <typename SORT_ITEMS,
@@ -1271,7 +160,7 @@ namespace __merge_sort {
             typename KeysIt,
             typename ItemsIt,
             typename CompareOp>
-  THRUST_RUNTIME_FUNCTION 
+  THRUST_RUNTIME_FUNCTION
   void merge_sort(execution_policy<Derived>& policy,
                   KeysIt                     keys_first,
                   KeysIt                     keys_last,
@@ -1285,7 +174,6 @@ namespace __merge_sort {
 
     size_t       storage_size = 0;
     cudaStream_t stream       = cuda_cub::stream(policy);
-    bool         debug_sync   = THRUST_DEBUG_SYNC_FLAG;
 
     cudaError_t status;
     status = doit_step<SORT_ITEMS, STABLE>(NULL,
@@ -1294,8 +182,7 @@ namespace __merge_sort {
                                            items_first,
                                            count,
                                            compare_op,
-                                           stream,
-                                           debug_sync);
+                                           stream);
     cuda_cub::throw_on_error(status, "merge_sort: failed on 1st step");
 
     // Allocate temporary storage.
@@ -1309,11 +196,10 @@ namespace __merge_sort {
                                            items_first,
                                            count,
                                            compare_op,
-                                           stream,
-                                           debug_sync);
+                                           stream);
     cuda_cub::throw_on_error(status, "merge_sort: failed on 2nd step");
 
-    status = cuda_cub::synchronize(policy);
+    status = cuda_cub::synchronize_optional(policy);
     cuda_cub::throw_on_error(status, "merge_sort: failed to synchronize");
   }
 }    // namespace __merge_sort
@@ -1334,8 +220,7 @@ namespace __radix_sort {
          cub::DoubleBuffer<Key>&  keys_buffer,
          cub::DoubleBuffer<Item>& /*items_buffer*/,
          Size                     count,
-         cudaStream_t             stream,
-         bool                     debug_sync)
+         cudaStream_t             stream)
     {
       return cub::DeviceRadixSort::SortKeys(d_temp_storage,
                                             temp_storage_bytes,
@@ -1343,11 +228,10 @@ namespace __radix_sort {
                                             static_cast<int>(count),
                                             0,
                                             static_cast<int>(sizeof(Key) * 8),
-                                            stream,
-                                            debug_sync);
+                                            stream);
     }
   }; // struct dispatch -- sort keys in ascending order;
-  
+
   // sort keys in descending order
   template <class K>
   struct dispatch<thrust::detail::false_type, thrust::greater<K> >
@@ -1359,8 +243,7 @@ namespace __radix_sort {
          cub::DoubleBuffer<Key>&  keys_buffer,
          cub::DoubleBuffer<Item>& /*items_buffer*/,
          Size                     count,
-         cudaStream_t             stream,
-         bool                     debug_sync)
+         cudaStream_t             stream)
     {
       return cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
                                                       temp_storage_bytes,
@@ -1368,11 +251,10 @@ namespace __radix_sort {
                                                       static_cast<int>(count),
                                                       0,
                                                       static_cast<int>(sizeof(Key) * 8),
-                                                      stream,
-                                                      debug_sync);
+                                                      stream);
     }
   }; // struct dispatch -- sort keys in descending order;
-  
+
   // sort pairs in ascending order
   template <class K>
   struct dispatch<thrust::detail::true_type, thrust::less<K> >
@@ -1384,8 +266,7 @@ namespace __radix_sort {
          cub::DoubleBuffer<Key>&  keys_buffer,
          cub::DoubleBuffer<Item>& items_buffer,
          Size                     count,
-         cudaStream_t             stream,
-         bool                     debug_sync)
+         cudaStream_t             stream)
     {
       return cub::DeviceRadixSort::SortPairs(d_temp_storage,
                                              temp_storage_bytes,
@@ -1394,11 +275,10 @@ namespace __radix_sort {
                                              static_cast<int>(count),
                                              0,
                                              static_cast<int>(sizeof(Key) * 8),
-                                             stream,
-                                             debug_sync);
+                                             stream);
     }
   }; // struct dispatch -- sort pairs in ascending order;
-  
+
   // sort pairs in descending order
   template <class K>
   struct dispatch<thrust::detail::true_type, thrust::greater<K> >
@@ -1410,8 +290,7 @@ namespace __radix_sort {
          cub::DoubleBuffer<Key>&  keys_buffer,
          cub::DoubleBuffer<Item>& items_buffer,
          Size                     count,
-         cudaStream_t             stream,
-         bool                     debug_sync)
+         cudaStream_t             stream)
     {
       return cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
                                                        temp_storage_bytes,
@@ -1420,8 +299,7 @@ namespace __radix_sort {
                                                        static_cast<int>(count),
                                                        0,
                                                        static_cast<int>(sizeof(Key) * 8),
-                                                       stream,
-                                                       debug_sync);
+                                                       stream);
     }
   }; // struct dispatch -- sort pairs in descending order;
 
@@ -1440,7 +318,6 @@ namespace __radix_sort {
   {
     size_t       temp_storage_bytes = 0;
     cudaStream_t stream             = cuda_cub::stream(policy);
-    bool         debug_sync         = THRUST_DEBUG_SYNC_FLAG;
 
     cub::DoubleBuffer<Key>  keys_buffer(keys, NULL);
     cub::DoubleBuffer<Item> items_buffer(items, NULL);
@@ -1455,8 +332,7 @@ namespace __radix_sort {
                                                    keys_buffer,
                                                    items_buffer,
                                                    keys_count,
-                                                   stream,
-                                                   debug_sync);
+                                                   stream);
     cuda_cub::throw_on_error(status, "radix_sort: failed on 1st step");
 
     size_t keys_temp_storage  = core::align_to(sizeof(Key) * keys_count, 128);
@@ -1471,7 +347,7 @@ namespace __radix_sort {
       tmp(policy, storage_size);
 
     keys_buffer.d_buffers[1]  = thrust::detail::aligned_reinterpret_cast<Key*>(
-      tmp.data().get()  
+      tmp.data().get()
     );
     items_buffer.d_buffers[1] = thrust::detail::aligned_reinterpret_cast<Item*>(
       tmp.data().get() + keys_temp_storage
@@ -1485,8 +361,7 @@ namespace __radix_sort {
                                                    keys_buffer,
                                                    items_buffer,
                                                    keys_count,
-                                                   stream,
-                                                   debug_sync);
+                                                   stream);
     cuda_cub::throw_on_error(status, "radix_sort: failed on 2nd step");
 
     if (keys_buffer.selector != 0)
@@ -1494,10 +369,13 @@ namespace __radix_sort {
       Key* temp_ptr = reinterpret_cast<Key*>(keys_buffer.d_buffers[1]);
       cuda_cub::copy_n(policy, temp_ptr, keys_count, keys);
     }
-    if (SORT_ITEMS::value && items_buffer.selector != 0)
+    THRUST_IF_CONSTEXPR(SORT_ITEMS::value)
     {
-      Item* temp_ptr = reinterpret_cast<Item*>(items_buffer.d_buffers[1]);
-      cuda_cub::copy_n(policy, temp_ptr, items_count, items);
+      if (items_buffer.selector != 0)
+      {
+        Item *temp_ptr = reinterpret_cast<Item *>(items_buffer.d_buffers[1]);
+        cuda_cub::copy_n(policy, temp_ptr, items_count, items);
+      }
     }
   }
 }    // __radix_sort
@@ -1599,6 +477,10 @@ namespace __smart_sort {
     {
       cuda_cub::copy(policy, keys.begin(), keys.end(), keys_first);
     }
+
+    cuda_cub::throw_on_error(
+      cuda_cub::synchronize_optional(policy),
+      "smart_sort: failed to synchronize");
   }
 }    // namespace __smart_sort
 
@@ -1616,18 +498,15 @@ sort(execution_policy<Derived>& policy,
      ItemsIt                    last,
      CompareOp                  compare_op)
 {
-  if (__THRUST_HAS_CUDART__)
-  {
-    typedef typename thrust::iterator_value<ItemsIt>::type item_type;
-    __smart_sort::smart_sort<thrust::detail::false_type, thrust::detail::false_type>(
-        policy, first, last, (item_type*)NULL, compare_op);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    thrust::sort(cvt_to_seq(derived_cast(policy)), first, last, compare_op);
-#endif
-  }
+  THRUST_CDP_DISPATCH(
+    (using item_t = thrust::iterator_value_t<ItemsIt>; item_t *null_ = nullptr;
+     __smart_sort::smart_sort<thrust::detail::false_type,
+                              thrust::detail::false_type>(policy,
+                                                          first,
+                                                          last,
+                                                          null_,
+                                                          compare_op);),
+    (thrust::sort(cvt_to_seq(derived_cast(policy)), first, last, compare_op);));
 }
 
 __thrust_exec_check_disable__
@@ -1638,18 +517,18 @@ stable_sort(execution_policy<Derived>& policy,
             ItemsIt                    last,
             CompareOp                  compare_op)
 {
-  if (__THRUST_HAS_CUDART__)
-  {
-    typedef typename thrust::iterator_value<ItemsIt>::type item_type;
-    __smart_sort::smart_sort<thrust::detail::false_type, thrust::detail::true_type>(
-        policy, first, last, (item_type*)NULL, compare_op);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    thrust::stable_sort(cvt_to_seq(derived_cast(policy)), first, last, compare_op);
-#endif
-  }
+  THRUST_CDP_DISPATCH(
+    (using item_t = thrust::iterator_value_t<ItemsIt>; item_t *null_ = nullptr;
+     __smart_sort::smart_sort<thrust::detail::false_type,
+                              thrust::detail::true_type>(policy,
+                                                         first,
+                                                         last,
+                                                         null_,
+                                                         compare_op);),
+    (thrust::stable_sort(cvt_to_seq(derived_cast(policy)),
+                         first,
+                         last,
+                         compare_op);));
 }
 
 __thrust_exec_check_disable__
@@ -1661,18 +540,18 @@ sort_by_key(execution_policy<Derived>& policy,
             ValuesIt                   values,
             CompareOp                  compare_op)
 {
-  if (__THRUST_HAS_CUDART__)
-  {
-    __smart_sort::smart_sort<thrust::detail::true_type, thrust::detail::false_type>(
-        policy, keys_first, keys_last, values, compare_op);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    thrust::sort_by_key(
-        cvt_to_seq(derived_cast(policy)), keys_first, keys_last, values, compare_op);
-#endif
-  }
+  THRUST_CDP_DISPATCH(
+    (__smart_sort::smart_sort<thrust::detail::true_type,
+                              thrust::detail::false_type>(policy,
+                                                          keys_first,
+                                                          keys_last,
+                                                          values,
+                                                          compare_op);),
+    (thrust::sort_by_key(cvt_to_seq(derived_cast(policy)),
+                         keys_first,
+                         keys_last,
+                         values,
+                         compare_op);));
 }
 
 __thrust_exec_check_disable__
@@ -1687,18 +566,18 @@ stable_sort_by_key(execution_policy<Derived> &policy,
             ValuesIt                   values,
             CompareOp                  compare_op)
 {
-  if (__THRUST_HAS_CUDART__)
-  {
-    __smart_sort::smart_sort<thrust::detail::true_type, thrust::detail::true_type>(
-        policy, keys_first, keys_last, values, compare_op);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    thrust::stable_sort_by_key(
-        cvt_to_seq(derived_cast(policy)), keys_first, keys_last, values, compare_op);
-#endif
-  }
+  THRUST_CDP_DISPATCH(
+    (__smart_sort::smart_sort<thrust::detail::true_type,
+                              thrust::detail::true_type>(policy,
+                                                         keys_first,
+                                                         keys_last,
+                                                         values,
+                                                         compare_op);),
+    (thrust::stable_sort_by_key(cvt_to_seq(derived_cast(policy)),
+                                keys_first,
+                                keys_last,
+                                values,
+                                compare_op);));
 }
 
 // API with default comparator
@@ -1745,5 +624,5 @@ stable_sort_by_key(
 
 
 }    // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/swap_ranges.h b/thrust/system/cuda/detail/swap_ranges.h
index c8d56467b..8f9e4fa8a 100644
--- a/thrust/system/cuda/detail/swap_ranges.h
+++ b/thrust/system/cuda/detail/swap_ranges.h
@@ -26,6 +26,7 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <iterator>
@@ -35,7 +36,7 @@
 #include <thrust/system/cuda/detail/parallel_for.h>
 #include <thrust/distance.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace cuda_cub {
 
@@ -91,16 +92,11 @@ swap_ranges(execution_policy<Derived> &policy,
                                                ItemsIt2>(first1, first2),
                          num_items);
 
-  cuda_cub::throw_on_error(
-    cuda_cub::synchronize(policy)
-  , "swap_ranges: failed to synchronize"
-  );
-
   return first2 + num_items;
 }
 
 
 }    // namespace cuda_cub
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/tabulate.h b/thrust/system/cuda/detail/tabulate.h
index 2e5316f4c..67edb8574 100644
--- a/thrust/system/cuda/detail/tabulate.h
+++ b/thrust/system/cuda/detail/tabulate.h
@@ -26,6 +26,7 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <thrust/distance.h>
@@ -34,7 +35,7 @@
 #include <thrust/system/cuda/detail/parallel_for.h>
 #include <thrust/distance.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 namespace __tabulate {
@@ -75,13 +76,8 @@ tabulate(execution_policy<Derived>& policy,
   cuda_cub::parallel_for(policy,
                          functor_t(first, tabulate_op),
                          count);
-
-  cuda_cub::throw_on_error(
-    cuda_cub::synchronize(policy)
-  , "tabulate: failed to synchronize"
-  );
 }
 
 }    // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/terminate.h b/thrust/system/cuda/detail/terminate.h
index d14bed2ab..226c9d5ac 100644
--- a/thrust/system/cuda/detail/terminate.h
+++ b/thrust/system/cuda/detail/terminate.h
@@ -31,8 +31,7 @@
 #include <thrust/system/cuda/detail/util.h>
 #include <cstdio>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace cuda
@@ -59,5 +58,5 @@ void terminate_with_message(const char* message)
 } // end detail
 } // end cuda
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cuda/detail/transform.h b/thrust/system/cuda/detail/transform.h
index 85e1cf69b..3cf171a47 100644
--- a/thrust/system/cuda/detail/transform.h
+++ b/thrust/system/cuda/detail/transform.h
@@ -26,6 +26,7 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <thrust/system/cuda/config.h>
@@ -35,7 +36,7 @@
 #include <thrust/system/cuda/detail/parallel_for.h>
 #include <thrust/distance.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace cuda_cub {
 
@@ -231,11 +232,6 @@ namespace __transform {
                                              predicate),
                            num_items);
 
-    cuda_cub::throw_on_error(
-      cuda_cub::synchronize(policy)
-    , "transform: failed to synchronize"
-    );
-
     return result + num_items;
   }
 
@@ -277,11 +273,6 @@ namespace __transform {
                                               predicate),
                            num_items);
 
-    cuda_cub::throw_on_error(
-      cuda_cub::synchronize(policy)
-    , "transform: failed to synchronize"
-    );
-
     return result + num_items;
   }
 
@@ -421,5 +412,5 @@ transform(execution_policy<Derived> &policy,
 
 }    // namespace cuda_cub
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/transform_reduce.h b/thrust/system/cuda/detail/transform_reduce.h
index 8cfe2ac71..60efaae5a 100644
--- a/thrust/system/cuda/detail/transform_reduce.h
+++ b/thrust/system/cuda/detail/transform_reduce.h
@@ -26,13 +26,14 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <iterator>
 #include <thrust/system/cuda/detail/reduce.h>
 #include <thrust/distance.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
 template <class Derived,
@@ -63,5 +64,5 @@ transform_reduce(execution_policy<Derived> &policy,
 }
 
 }    // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/transform_scan.h b/thrust/system/cuda/detail/transform_scan.h
index 1ebfea506..8f14ca8f7 100644
--- a/thrust/system/cuda/detail/transform_scan.h
+++ b/thrust/system/cuda/detail/transform_scan.h
@@ -26,13 +26,15 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <iterator>
-#include <thrust/system/cuda/detail/scan.h>
+#include <thrust/detail/type_traits.h>
 #include <thrust/distance.h>
+#include <thrust/system/cuda/detail/scan.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace cuda_cub {
 
@@ -49,30 +51,14 @@ transform_inclusive_scan(execution_policy<Derived> &policy,
                          TransformOp                transform_op,
                          ScanOp                     scan_op)
 {
-  // the pseudocode for deducing the type of the temporary used below:
-  // 
-  // if UnaryFunction is AdaptableUnaryFunction
-  //   TemporaryType = AdaptableUnaryFunction::result_type
-  // else if OutputIterator is a "pure" output iterator
-  //   TemporaryType = InputIterator::value_type
-  // else
-  //   TemporaryType = OutputIterator::value_type
-  //
-  // XXX upon c++0x, TemporaryType needs to be:
-  // result_of_adaptable_function<UnaryFunction>::type
-  typedef typename thrust::detail::eval_if<
-    thrust::detail::has_result_type<TransformOp>::value,
-    thrust::detail::result_type<TransformOp>,
-    thrust::detail::eval_if<
-      thrust::detail::is_output_iterator<OutputIt>::value,
-      iterator_value<InputIt>,
-      iterator_value<OutputIt>
-    >
-  >::type result_type;
+  // Use the transformed input iterator's value type per https://wg21.link/P0571
+  using input_type = typename thrust::iterator_value<InputIt>::type;
+  using result_type = thrust::detail::invoke_result_t<TransformOp, input_type>;
+  using value_type = thrust::remove_cvref_t<result_type>;
 
   typedef typename iterator_traits<InputIt>::difference_type size_type;
   size_type num_items = static_cast<size_type>(thrust::distance(first, last));
-  typedef transform_input_iterator_t<result_type,
+  typedef transform_input_iterator_t<value_type,
                                      InputIt,
                                      TransformOp>
       transformed_iterator_t;
@@ -88,7 +74,7 @@ template <class Derived,
           class InputIt,
           class OutputIt,
           class TransformOp,
-          class T,
+          class InitialValueType,
           class ScanOp>
 OutputIt __host__ __device__
 transform_exclusive_scan(execution_policy<Derived> &policy,
@@ -96,30 +82,11 @@ transform_exclusive_scan(execution_policy<Derived> &policy,
                          InputIt                    last,
                          OutputIt                   result,
                          TransformOp                transform_op,
-                         T                          init,
+                         InitialValueType           init,
                          ScanOp                     scan_op)
 {
-  // the pseudocode for deducing the type of the temporary used below:
-  // 
-  // if UnaryFunction is AdaptableUnaryFunction
-  //   TemporaryType = AdaptableUnaryFunction::result_type
-  // else if OutputIterator is a "pure" output iterator
-  //   TemporaryType = InputIterator::value_type
-  // else
-  //   TemporaryType = OutputIterator::value_type
-  //
-  // XXX upon c++0x, TemporaryType needs to be:
-  // result_of_adaptable_function<UnaryFunction>::type
-
-  typedef typename thrust::detail::eval_if<
-    thrust::detail::has_result_type<TransformOp>::value,
-    thrust::detail::result_type<TransformOp>,
-    thrust::detail::eval_if<
-      thrust::detail::is_output_iterator<OutputIt>::value,
-      thrust::iterator_value<InputIt>,
-      thrust::iterator_value<OutputIt>
-    >
-  >::type result_type;
+  // Use the initial value type per https://wg21.link/P0571
+  using result_type = thrust::remove_cvref_t<InitialValueType>;
 
   typedef typename iterator_traits<InputIt>::difference_type size_type;
   size_type num_items = static_cast<size_type>(thrust::distance(first, last));
@@ -138,5 +105,5 @@ transform_exclusive_scan(execution_policy<Derived> &policy,
 
 }    // namespace cuda_cub
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/uninitialized_copy.h b/thrust/system/cuda/detail/uninitialized_copy.h
index 71a72c0e9..f21b7c0d6 100644
--- a/thrust/system/cuda/detail/uninitialized_copy.h
+++ b/thrust/system/cuda/detail/uninitialized_copy.h
@@ -26,6 +26,7 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <iterator>
@@ -34,7 +35,7 @@
 #include <thrust/system/cuda/detail/util.h>
 #include <thrust/system/cuda/detail/parallel_for.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace cuda_cub {
 
@@ -86,11 +87,6 @@ uninitialized_copy_n(execution_policy<Derived> &policy,
                          functor_t(first, result),
                          count);
 
-  cuda_cub::throw_on_error(
-    cuda_cub::synchronize(policy)
-  , "uninitialized_copy_n: failed to synchronize"
-  );
-
   return result + count;
 }
 
@@ -111,5 +107,5 @@ uninitialized_copy(execution_policy<Derived>& policy,
 
 }    // namespace cuda_
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/uninitialized_fill.h b/thrust/system/cuda/detail/uninitialized_fill.h
index ad990333f..96b970201 100644
--- a/thrust/system/cuda/detail/uninitialized_fill.h
+++ b/thrust/system/cuda/detail/uninitialized_fill.h
@@ -26,6 +26,7 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 #include <iterator>
@@ -34,7 +35,7 @@
 #include <thrust/system/cuda/detail/util.h>
 #include <thrust/system/cuda/detail/parallel_for.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace cuda_cub {
 
@@ -84,11 +85,6 @@ uninitialized_fill_n(execution_policy<Derived>& policy,
                          functor_t(first, x),
                          count);
 
-  cuda_cub::throw_on_error(
-    cuda_cub::synchronize(policy)
-  , "uninitialized_fill_n: failed to synchronize"
-  );
-
   return first + count;
 }
 
@@ -109,5 +105,5 @@ uninitialized_fill(execution_policy<Derived>& policy,
 
 }    // namespace cuda_cub
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 #endif
diff --git a/thrust/system/cuda/detail/unique.h b/thrust/system/cuda/detail/unique.h
index 4683cf3e6..653ffa79a 100644
--- a/thrust/system/cuda/detail/unique.h
+++ b/thrust/system/cuda/detail/unique.h
@@ -26,23 +26,26 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-#include <thrust/system/cuda/config.h>
 
-#include <thrust/system/cuda/detail/cub/device/device_select.cuh>
-#include <thrust/system/cuda/detail/core/agent_launcher.h>
-#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/advance.h>
 #include <thrust/detail/cstdint.h>
-#include <thrust/detail/temporary_array.h>
-#include <thrust/system/cuda/detail/util.h>
-#include <thrust/system/cuda/detail/get_value.h>
-#include <thrust/functional.h>
-#include <thrust/detail/mpl/math.h>
 #include <thrust/detail/minmax.h>
 #include <thrust/distance.h>
+#include <thrust/functional.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/get_value.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/util.h>
 
-THRUST_BEGIN_NS
+#include <cub/device/device_select.cuh>
+#include <cub/util_math.cuh>
+
+THRUST_NAMESPACE_BEGIN
 
 template <typename DerivedPolicy,
           typename ForwardIterator,
@@ -66,6 +69,16 @@ unique_copy(
     OutputIterator                                              result,
     BinaryPredicate                                             binary_pred);
 
+template <typename DerivedPolicy,
+          typename ForwardIterator,
+          typename BinaryPredicate>
+__host__ __device__ typename thrust::iterator_traits<ForwardIterator>::difference_type
+unique_count(
+    const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+    ForwardIterator                                             first,
+    ForwardIterator                                             last,
+    BinaryPredicate                                             binary_pred);
+
 namespace cuda_cub {
 
 // XXX  it should be possible to unify unique & unique_by_key into a single
@@ -77,15 +90,13 @@ namespace __unique {
             int                     _ITEMS_PER_THREAD = 1,
             cub::BlockLoadAlgorithm _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
             cub::CacheLoadModifier  _LOAD_MODIFIER    = cub::LOAD_LDG,
-            cub::BlockScanAlgorithm _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS,
-            int                     _MIN_BLOCKS       = 1>
+            cub::BlockScanAlgorithm _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS>
   struct PtxPolicy
   {
     enum
     {
       BLOCK_THREADS    = _BLOCK_THREADS,
       ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
-      MIN_BLOCKS       = _MIN_BLOCKS,
       ITEMS_PER_TILE   = _BLOCK_THREADS * _ITEMS_PER_THREAD,
     };
     static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
@@ -95,10 +106,10 @@ namespace __unique {
 
   template<class,class>
   struct Tuning;
-  
+
   namespace mpl = thrust::detail::mpl::math;
 
-  template<class T, size_t NOMINAL_4B_ITEMS_PER_THREAD>
+  template<class T, int NOMINAL_4B_ITEMS_PER_THREAD>
   struct items_per_thread
   {
     enum
@@ -108,7 +119,7 @@ namespace __unique {
           NOMINAL_4B_ITEMS_PER_THREAD,
           mpl::max<int,
                    1,
-                   (NOMINAL_4B_ITEMS_PER_THREAD * 4 /
+                   static_cast<int>(NOMINAL_4B_ITEMS_PER_THREAD * 4 /
                     sizeof(T))>::value>::value
     };
   };
@@ -153,7 +164,7 @@ namespace __unique {
                       cub::BLOCK_SCAN_WARP_SCANS>
         type;
   };    // Tuning for sm35
-  
+
   template<class T>
   struct Tuning<sm30,T>
   {
@@ -173,7 +184,7 @@ namespace __unique {
                       cub::BLOCK_SCAN_WARP_SCANS>
         type;
   };    // Tuning for sm30
-  
+
   template <class ItemsIt,
             class ItemsOutputIt,
             class BinaryPred,
@@ -219,21 +230,21 @@ namespace __unique {
 
       union TempStorage
       {
-        struct
+        struct ScanStorage
         {
           typename BlockScan::TempStorage               scan;
           typename TilePrefixCallback::TempStorage      prefix;
           typename BlockDiscontinuityItems::TempStorage discontinuity;
-        };
+        } scan_storage;
 
         typename BlockLoadItems::TempStorage  load_items;
         shared_items_t shared_items;
-        
+
       };    // union TempStorage
     };      // struct PtxPlan
-    
+
     typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
-   
+
     typedef typename ptx_plan::ItemsLoadIt             ItemsLoadIt;
     typedef typename ptx_plan::BlockLoadItems          BlockLoadItems;
     typedef typename ptx_plan::BlockDiscontinuityItems BlockDiscontinuityItems;
@@ -248,7 +259,7 @@ namespace __unique {
       ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
       ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE
     };
-    
+
     struct impl
     {
       //---------------------------------------------------------------------
@@ -265,7 +276,7 @@ namespace __unique {
       //---------------------------------------------------------------------
       // Utility functions
       //---------------------------------------------------------------------
-      
+
       THRUST_DEVICE_FUNCTION
       shared_items_t &get_shared()
       {
@@ -342,13 +353,13 @@ namespace __unique {
 
         if (IS_FIRST_TILE)
         {
-          BlockDiscontinuityItems(temp_storage.discontinuity)
+          BlockDiscontinuityItems(temp_storage.scan_storage.discontinuity)
               .FlagHeads(selection_flags, items_loc, predicate);
         }
         else
         {
           item_type tile_predecessor = items_in[tile_base - 1];
-          BlockDiscontinuityItems(temp_storage.discontinuity)
+          BlockDiscontinuityItems(temp_storage.scan_storage.discontinuity)
               .FlagHeads(selection_flags, items_loc, predicate, tile_predecessor);
         }
 
@@ -368,7 +379,7 @@ namespace __unique {
         Size num_selections_prefix = 0;
         if (IS_FIRST_TILE)
         {
-          BlockScan(temp_storage.scan)
+          BlockScan(temp_storage.scan_storage.scan)
               .ExclusiveSum(selection_flags,
                             selection_idx,
                             num_tile_selections);
@@ -391,10 +402,10 @@ namespace __unique {
         else
         {
           TilePrefixCallback prefix_cb(tile_state,
-                                       temp_storage.prefix,
+                                       temp_storage.scan_storage.prefix,
                                        cub::Sum(),
                                        tile_idx);
-          BlockScan(temp_storage.scan)
+          BlockScan(temp_storage.scan_storage.scan)
               .ExclusiveSum(selection_flags,
                             selection_idx,
                             prefix_cb);
@@ -513,7 +524,7 @@ namespace __unique {
            num_selected_out);
     }
   };    // struct UniqueAgent
-  
+
   template <class ScanTileState,
             class NumSelectedIt,
             class Size>
@@ -552,8 +563,7 @@ namespace __unique {
             BinaryPred       binary_pred,
             NumSelectedOutIt num_selected_out,
             Size             num_items,
-            cudaStream_t     stream,
-            bool             debug_sync)
+            cudaStream_t     stream)
   {
     using core::AgentLauncher;
     using core::AgentPlan;
@@ -579,7 +589,7 @@ namespace __unique {
 
 
     int tile_size = unique_plan.items_per_tile;
-    size_t num_tiles = (num_items + tile_size - 1) / tile_size;
+    size_t num_tiles = cub::DivideAndRoundUp(num_items, tile_size);
 
     size_t vshmem_size = core::vshmem_size(unique_plan.shared_memory_size,
                                            num_tiles);
@@ -605,17 +615,17 @@ namespace __unique {
     ScanTileState tile_status;
     status =  tile_status.Init(static_cast<int>(num_tiles), allocations[0], allocation_sizes[0]);
     CUDA_CUB_RET_IF_FAIL(status);
-   
+
     num_tiles = max<size_t>(1,num_tiles);
-    init_agent ia(init_plan, num_tiles, stream, "unique_by_key::init_agent", debug_sync);
+    init_agent ia(init_plan, num_tiles, stream, "unique_by_key::init_agent");
     ia.launch(tile_status, num_tiles, num_selected_out);
     CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
-    
+
     if (num_items == 0) { return status; }
 
     char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[1] : NULL;
 
-    unique_agent ua(unique_plan, num_items, stream, vshmem_ptr, "unique_by_key::unique_agent", debug_sync);
+    unique_agent ua(unique_plan, num_items, stream, vshmem_ptr, "unique_by_key::unique_agent");
     ua.launch(items_in,
               items_out,
               binary_pred,
@@ -644,7 +654,6 @@ namespace __unique {
     size_type    num_items          = static_cast<size_type>(thrust::distance(items_first, items_last));
     size_t       temp_storage_bytes = 0;
     cudaStream_t stream             = cuda_cub::stream(policy);
-    bool         debug_sync         = THRUST_DEBUG_SYNC_FLAG;
 
     cudaError_t status;
     status = doit_step(NULL,
@@ -654,8 +663,7 @@ namespace __unique {
                        binary_pred,
                        reinterpret_cast<size_type*>(NULL),
                        num_items,
-                       stream,
-                       debug_sync);
+                       stream);
     cuda_cub::throw_on_error(status, "unique: failed on 1st step");
 
     size_t allocation_sizes[2] = {sizeof(size_type), temp_storage_bytes};
@@ -689,8 +697,7 @@ namespace __unique {
                        binary_pred,
                        d_num_selected_out,
                        num_items,
-                       stream,
-                       debug_sync);
+                       stream);
     cuda_cub::throw_on_error(status, "unique: failed on 2nd step");
 
     status = cuda_cub::synchronize(policy);
@@ -718,26 +725,14 @@ unique_copy(execution_policy<Derived> &policy,
             OutputIt                   result,
             BinaryPred                 binary_pred)
 {
-  OutputIt ret = result;
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __unique::unique(policy,
-                           first,
-                           last,
-                           result,
-                           binary_pred);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::unique_copy(cvt_to_seq(derived_cast(policy)),
-                              first,
-                              last,
-                              result,
-                              binary_pred);
-#endif
-  }
-  return ret;
+  THRUST_CDP_DISPATCH(
+    (result = __unique::unique(policy, first, last, result, binary_pred);),
+    (result = thrust::unique_copy(cvt_to_seq(derived_cast(policy)),
+                                  first,
+                                  last,
+                                  result,
+                                  binary_pred);));
+  return result;
 }
 
 template <class Derived,
@@ -757,44 +752,68 @@ unique_copy(execution_policy<Derived> &policy,
 
 __thrust_exec_check_disable__
 template <class Derived,
-          class InputIt,
+          class ForwardIt,
           class BinaryPred>
-InputIt __host__ __device__
+ForwardIt __host__ __device__
 unique(execution_policy<Derived> &policy,
-       InputIt                    first,
-       InputIt                    last,
+       ForwardIt                  first,
+       ForwardIt                  last,
        BinaryPred                 binary_pred)
 {
-  InputIt ret = first;
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = cuda_cub::unique_copy(policy, first, last, first, binary_pred);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::unique(cvt_to_seq(derived_cast(policy)),
-                         first,
-                         last,
-                         binary_pred);
-#endif
-  }
+  ForwardIt ret = first;
+  THRUST_CDP_DISPATCH(
+    (ret = cuda_cub::unique_copy(policy, first, last, first, binary_pred);),
+    (ret = thrust::unique(cvt_to_seq(derived_cast(policy)),
+                          first,
+                          last,
+                          binary_pred);));
   return ret;
 }
 
 template <class Derived,
-          class InputIt>
-InputIt __host__ __device__
+          class ForwardIt>
+ForwardIt __host__ __device__
 unique(execution_policy<Derived> &policy,
-       InputIt                    first,
-       InputIt                    last)
+       ForwardIt                  first,
+       ForwardIt                  last)
 {
-  typedef typename iterator_traits<InputIt>::value_type input_type;
+  typedef typename iterator_traits<ForwardIt>::value_type input_type;
   return cuda_cub::unique(policy, first, last, equal_to<input_type>());
 }
 
+
+template <typename BinaryPred>
+struct zip_adj_not_predicate {
+  template <typename TupleType>
+  bool __host__ __device__ operator()(TupleType&& tuple) {
+      return !binary_pred(thrust::get<0>(tuple), thrust::get<1>(tuple));
+  }
+  
+  BinaryPred binary_pred;
+};
+
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class ForwardIt,
+          class BinaryPred>
+typename thrust::iterator_traits<ForwardIt>::difference_type
+__host__ __device__
+unique_count(execution_policy<Derived> &policy,
+       ForwardIt                  first,
+       ForwardIt                  last,
+       BinaryPred                 binary_pred)
+{
+  if (first == last) {
+    return 0;
+  }
+  auto size = thrust::distance(first, last);
+  auto it = thrust::make_zip_iterator(thrust::make_tuple(first, thrust::next(first)));
+  return 1 + thrust::count_if(policy, it, thrust::next(it, size - 1), zip_adj_not_predicate<BinaryPred>{binary_pred});
+}
+
 }    // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 //
 #include <thrust/memory.h>
diff --git a/thrust/system/cuda/detail/unique_by_key.h b/thrust/system/cuda/detail/unique_by_key.h
index 209af4ece..d5ce8e786 100644
--- a/thrust/system/cuda/detail/unique_by_key.h
+++ b/thrust/system/cuda/detail/unique_by_key.h
@@ -26,25 +26,29 @@
  ******************************************************************************/
 #pragma once
 
+#include <thrust/detail/config.h>
 
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-#include <thrust/system/cuda/config.h>
 
+#include <thrust/detail/alignment.h>
 #include <thrust/detail/cstdint.h>
 #include <thrust/detail/temporary_array.h>
-#include <thrust/system/cuda/detail/util.h>
-#include <thrust/system/cuda/detail/cub/device/device_select.cuh>
+#include <thrust/detail/minmax.h>
+#include <thrust/detail/mpl/math.h>
+#include <thrust/distance.h>
+#include <thrust/functional.h>
+#include <thrust/pair.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
 #include <thrust/system/cuda/detail/core/agent_launcher.h>
 #include <thrust/system/cuda/detail/get_value.h>
 #include <thrust/system/cuda/detail/par_to_seq.h>
-#include <thrust/functional.h>
-#include <thrust/pair.h>
-#include <thrust/detail/mpl/math.h>
-#include <thrust/detail/minmax.h>
-#include <thrust/distance.h>
-#include <thrust/detail/alignment.h>
+#include <thrust/system/cuda/detail/util.h>
 
-THRUST_BEGIN_NS
+#include <cub/device/device_select.cuh>
+#include <cub/util_math.cuh>
+
+THRUST_NAMESPACE_BEGIN
 
 template <typename DerivedPolicy,
           typename ForwardIterator1,
@@ -81,15 +85,13 @@ namespace __unique_by_key {
             int                     _ITEMS_PER_THREAD = 1,
             cub::BlockLoadAlgorithm _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
             cub::CacheLoadModifier  _LOAD_MODIFIER    = cub::LOAD_LDG,
-            cub::BlockScanAlgorithm _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS,
-            int                     _MIN_BLOCKS       = 1>
+            cub::BlockScanAlgorithm _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS>
   struct PtxPolicy
   {
     enum
     {
       BLOCK_THREADS    = _BLOCK_THREADS,
       ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
-      MIN_BLOCKS       = _MIN_BLOCKS,
       ITEMS_PER_TILE   = _BLOCK_THREADS * _ITEMS_PER_THREAD,
     };
     static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
@@ -99,7 +101,7 @@ namespace __unique_by_key {
 
   template<class,class>
   struct Tuning;
-  
+
   namespace mpl = thrust::detail::mpl::math;
 
   template<class T, size_t NOMINAL_4B_ITEMS_PER_THREAD>
@@ -109,11 +111,11 @@ namespace __unique_by_key {
     {
       value = mpl::min<
           int,
-          NOMINAL_4B_ITEMS_PER_THREAD,
+          static_cast<int>(NOMINAL_4B_ITEMS_PER_THREAD),
           mpl::max<int,
                    1,
-                   (NOMINAL_4B_ITEMS_PER_THREAD * 4 /
-                    sizeof(T))>::value>::value
+                   static_cast<int>(NOMINAL_4B_ITEMS_PER_THREAD * 4 /
+                   sizeof(T))>::value>::value
     };
   };
 
@@ -137,7 +139,7 @@ namespace __unique_by_key {
                       cub::BLOCK_SCAN_WARP_SCANS>
         type;
   };    // Tuning for sm52
-  
+
   template<class T>
   struct Tuning<sm35,T>
   {
@@ -157,7 +159,7 @@ namespace __unique_by_key {
                       cub::BLOCK_SCAN_WARP_SCANS>
         type;
   };    // Tuning for sm35
-  
+
   template<class T>
   struct Tuning<sm30,T>
   {
@@ -177,7 +179,7 @@ namespace __unique_by_key {
                       cub::BLOCK_SCAN_WARP_SCANS>
         type;
   };    // Tuning for sm30
-  
+
   template <class KeyInputIt,
             class ValInputIt,
             class KeyOutputIt,
@@ -230,12 +232,12 @@ namespace __unique_by_key {
 
       union TempStorage
       {
-        struct
+        struct ScanStorage
         {
           typename BlockScan::TempStorage              scan;
           typename TilePrefixCallback::TempStorage     prefix;
           typename BlockDiscontinuityKeys::TempStorage discontinuity;
-        };
+        } scan_storage;
 
         typename BlockLoadKeys::TempStorage   load_keys;
         typename BlockLoadValues::TempStorage load_values;
@@ -337,7 +339,7 @@ namespace __unique_by_key {
 
         sync_threadblock();
       }
-      
+
       //---------------------------------------------------------------------
       // Tile processing
       //---------------------------------------------------------------------
@@ -393,13 +395,13 @@ namespace __unique_by_key {
 
         if (IS_FIRST_TILE)
         {
-          BlockDiscontinuityKeys(temp_storage.discontinuity)
+          BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity)
               .FlagHeads(selection_flags, keys, predicate);
         }
         else
         {
           key_type tile_predecessor = keys_in[tile_base - 1];
-          BlockDiscontinuityKeys(temp_storage.discontinuity)
+          BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity)
               .FlagHeads(selection_flags, keys, predicate, tile_predecessor);
         }
 #pragma unroll
@@ -418,7 +420,7 @@ namespace __unique_by_key {
         Size num_selections_prefix = 0;
         if (IS_FIRST_TILE)
         {
-          BlockScan(temp_storage.scan)
+          BlockScan(temp_storage.scan_storage.scan)
               .ExclusiveSum(selection_flags,
                             selection_idx,
                             num_tile_selections);
@@ -441,10 +443,10 @@ namespace __unique_by_key {
         else
         {
           TilePrefixCallback prefix_cb(tile_state,
-                                       temp_storage.prefix,
+                                       temp_storage.scan_storage.prefix,
                                        cub::Sum(),
                                        tile_idx);
-          BlockScan(temp_storage.scan)
+          BlockScan(temp_storage.scan_storage.scan)
               .ExclusiveSum(selection_flags,
                             selection_idx,
                             prefix_cb);
@@ -632,8 +634,7 @@ namespace __unique_by_key {
             BinaryPred       binary_pred,
             NumSelectedOutIt num_selected_out,
             Size             num_items,
-            cudaStream_t     stream,
-            bool             debug_sync)
+            cudaStream_t     stream)
   {
     using core::AgentLauncher;
     using core::AgentPlan;
@@ -648,7 +649,7 @@ namespace __unique_by_key {
                          Size,
                          NumSelectedOutIt> >
         unique_agent;
-    
+
     typedef typename unique_agent::ScanTileState ScanTileState;
 
     typedef AgentLauncher<
@@ -661,7 +662,7 @@ namespace __unique_by_key {
 
 
     int tile_size = unique_plan.items_per_tile;
-    size_t num_tiles = (num_items + tile_size - 1) / tile_size;
+    size_t num_tiles = cub::DivideAndRoundUp(num_items, tile_size);
 
     size_t vshmem_size = core::vshmem_size(unique_plan.shared_memory_size,
                                            num_tiles);
@@ -687,17 +688,17 @@ namespace __unique_by_key {
     ScanTileState tile_status;
     status =  tile_status.Init(static_cast<int>(num_tiles), allocations[0], allocation_sizes[0]);
     CUDA_CUB_RET_IF_FAIL(status);
-   
+
     num_tiles = max<size_t>(1,num_tiles);
-    init_agent ia(init_plan, num_tiles, stream, "unique_by_key::init_agent", debug_sync);
+    init_agent ia(init_plan, num_tiles, stream, "unique_by_key::init_agent");
     ia.launch(tile_status, num_tiles, num_selected_out);
     CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
-    
-    if (num_items == 0) { return status; } 
+
+    if (num_items == 0) { return status; }
 
     char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[1] : NULL;
 
-    unique_agent ua(unique_plan, num_items, stream, vshmem_ptr, "unique_by_key::unique_agent", debug_sync);
+    unique_agent ua(unique_plan, num_items, stream, vshmem_ptr, "unique_by_key::unique_agent");
     ua.launch(keys_in,
               values_in,
               keys_out,
@@ -730,12 +731,11 @@ namespace __unique_by_key {
 
     typedef int size_type;
 
-    size_type num_items 
+    size_type num_items
       = static_cast<size_type>(thrust::distance(keys_first, keys_last));
 
     size_t       temp_storage_bytes = 0;
     cudaStream_t stream             = cuda_cub::stream(policy);
-    bool         debug_sync         = THRUST_DEBUG_SYNC_FLAG;
 
     cudaError_t status;
     status = __unique_by_key::doit_step(NULL,
@@ -747,8 +747,7 @@ namespace __unique_by_key {
                                         binary_pred,
                                         reinterpret_cast<size_type*>(NULL),
                                         num_items,
-                                        stream,
-                                        debug_sync);
+                                        stream);
     cuda_cub::throw_on_error(status, "unique_by_key: failed on 1st step");
 
     size_t allocation_sizes[2] = {sizeof(size_type), temp_storage_bytes};
@@ -784,8 +783,7 @@ namespace __unique_by_key {
                                         binary_pred,
                                         d_num_selected_out,
                                         num_items,
-                                        stream,
-                                        debug_sync);
+                                        stream);
     cuda_cub::throw_on_error(status, "unique_by_key: failed on 2nd step");
 
     status = cuda_cub::synchronize(policy);
@@ -823,29 +821,22 @@ unique_by_key_copy(execution_policy<Derived> &policy,
                    ValOutputIt                values_result,
                    BinaryPred                 binary_pred)
 {
-  pair<KeyOutputIt, ValOutputIt> ret = thrust::make_pair(keys_result, values_result);
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = __unique_by_key::unique_by_key(policy,
-                                keys_first,
-                                keys_last,
-                                values_first,
-                                keys_result,
-                                values_result,
-                                binary_pred);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::unique_by_key_copy(cvt_to_seq(derived_cast(policy)),
-                                     keys_first,
-                                     keys_last,
-                                     values_first,
-                                     keys_result,
-                                     values_result,
-                                     binary_pred);
-#endif
-  }
+  auto ret = thrust::make_pair(keys_result, values_result);
+  THRUST_CDP_DISPATCH(
+    (ret = __unique_by_key::unique_by_key(policy,
+                                          keys_first,
+                                          keys_last,
+                                          values_first,
+                                          keys_result,
+                                          values_result,
+                                          binary_pred);),
+    (ret = thrust::unique_by_key_copy(cvt_to_seq(derived_cast(policy)),
+                                      keys_first,
+                                      keys_last,
+                                      values_first,
+                                      keys_result,
+                                      values_result,
+                                      binary_pred);));
   return ret;
 }
 
@@ -883,27 +874,20 @@ unique_by_key(execution_policy<Derived> &policy,
               ValInputIt                 values_first,
               BinaryPred                 binary_pred)
 {
-  pair<KeyInputIt, ValInputIt> ret = thrust::make_pair(keys_first, values_first);
-  if (__THRUST_HAS_CUDART__)
-  {
-    ret = cuda_cub::unique_by_key_copy(policy,
-                                       keys_first,
-                                       keys_last,
-                                       values_first,
-                                       keys_first,
-                                       values_first,
-                                       binary_pred);
-  }
-  else
-  {
-#if !__THRUST_HAS_CUDART__
-    ret = thrust::unique_by_key(cvt_to_seq(derived_cast(policy)),
-                                keys_first,
-                                keys_last,
-                                values_first,
-                                binary_pred);
-#endif
-  }
+  auto ret = thrust::make_pair(keys_first, values_first);
+  THRUST_CDP_DISPATCH(
+    (ret = cuda_cub::unique_by_key_copy(policy,
+                                         keys_first,
+                                         keys_last,
+                                         values_first,
+                                         keys_first,
+                                         values_first,
+                                         binary_pred);),
+    (ret = thrust::unique_by_key(cvt_to_seq(derived_cast(policy)),
+                                  keys_first,
+                                  keys_last,
+                                  values_first,
+                                  binary_pred);));
   return ret;
 }
 
@@ -927,7 +911,7 @@ unique_by_key(execution_policy<Derived> &policy,
 
 
 }    // namespace cuda_cub
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #include <thrust/memory.h>
 #include <thrust/unique.h>
diff --git a/thrust/system/cuda/detail/util.h b/thrust/system/cuda/detail/util.h
index 7e6df7b8c..6d9e3681d 100644
--- a/thrust/system/cuda/detail/util.h
+++ b/thrust/system/cuda/detail/util.h
@@ -27,25 +27,43 @@
 #pragma once
 
 #include <cstdio>
+#include <exception>
 #include <thrust/detail/config.h>
 #include <thrust/iterator/iterator_traits.h>
-#include <thrust/system/cuda/detail/cub/util_arch.cuh>
 #include <thrust/system/cuda/detail/execution_policy.h>
 #include <thrust/system_error.h>
 #include <thrust/system/cuda/error.h>
 
-THRUST_BEGIN_NS
+#include <cub/detail/device_synchronize.cuh>
+#include <cub/util_arch.cuh>
+#include <cub/util_device.cuh>
 
+#include <nv/target>
+
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub {
 
-template <class Policy>
+inline __host__ __device__
+cudaStream_t
+default_stream()
+{
+#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
+  return cudaStreamPerThread;
+#else
+  return cudaStreamLegacy;
+#endif
+}
+
+// Fallback implementation of the customization point.
+template <class Derived>
 __host__ __device__
-cudaError_t
-synchronize(Policy &policy)
+cudaStream_t
+get_stream(execution_policy<Derived> &)
 {
-  return synchronize_stream(derived_cast(policy));
+  return default_stream();
 }
 
+// Entry point/interface.
 template <class Derived>
 __host__ __device__ cudaStream_t
 stream(execution_policy<Derived> &policy)
@@ -54,35 +72,72 @@ stream(execution_policy<Derived> &policy)
 }
 
 
-#if 0
-template <class Policy, class Type>
-CUB_RUNTIME_FUNCTION cudaError_t
-trivial_copy_from_device(Policy &    policy,
-                         Type *      dst,
-                         Type const *src,
-                         size_t      count)
+// Fallback implementation of the customization point.
+template <class Derived>
+__host__ __device__
+bool
+must_perform_optional_stream_synchronization(execution_policy<Derived> &)
 {
-  cudaError status = cudaSuccess;
-  if (count == 0) return status;
-#ifdef __CUDA_ARCH__
-  for (size_t i = 0; i != count; ++i)
+  return true;
+}
+
+// Entry point/interface.
+template <class Derived>
+__host__ __device__ bool
+must_perform_optional_synchronization(execution_policy<Derived> &policy)
+{
+  return must_perform_optional_stream_synchronization(derived_cast(policy));
+}
+
+
+// Fallback implementation of the customization point.
+__thrust_exec_check_disable__
+template <class Derived>
+__host__ __device__
+cudaError_t
+synchronize_stream(execution_policy<Derived> &policy)
+{
+  return cub::SyncStream(stream(policy));
+}
+
+// Entry point/interface.
+template <class Policy>
+__host__ __device__
+cudaError_t
+synchronize(Policy &policy)
+{
+  return synchronize_stream(derived_cast(policy));
+}
+
+// Fallback implementation of the customization point.
+__thrust_exec_check_disable__
+template <class Derived>
+__host__ __device__
+cudaError_t
+synchronize_stream_optional(execution_policy<Derived> &policy)
+{
+  cudaError_t result;
+
+  if (must_perform_optional_synchronization(policy))
   {
-    dst[i] = src[i];
+    result = synchronize_stream(policy);
+  }
+  else
+  {
+    result = cudaSuccess;
   }
-#else
-  cudaStream_t stream = cuda_cub::stream(policy);
-  //
-  status = ::cudaMemcpyAsync(dst,
-                             src,
-                             sizeof(Type) * count,
-                             cudaMemcpyDeviceToHost,
-                             stream);
-  cuda_cub::synchronize(policy);
 
-#endif
-  return status;
+  return result;
+}
+
+// Entry point/interface.
+template <class Policy>
+__host__ __device__
+cudaError_t
+synchronize_optional(Policy &policy)
+{
+  return synchronize_stream_optional(derived_cast(policy));
 }
-#endif
 
 template <class Type>
 THRUST_HOST_FUNCTION cudaError_t
@@ -103,34 +158,6 @@ trivial_copy_from_device(Type *       dst,
   return status;
 }
 
-#if 0
-template <class Policy, class Type>
-CUB_RUNTIME_FUNCTION cudaError_t
-trivial_copy_to_device(Policy &    ,
-                       Type *      dst,
-                       Type const *src,
-                       size_t      count)
-{
-  cudaError status = cudaSuccess;
-  if (count == 0) return status;
-#ifdef __CUDA_ARCH__
-  for (size_t i = 0; i != count; ++i)
-  {
-    dst[i] = src[i];
-  }
-#else
-  cudaStream_t stream = cuda_cub::stream(policy);
-  //
-  status = ::cudaMemcpyAsync(dst,
-                             src,
-                             sizeof(Type) * count,
-                             cudaMemcpyHostToDevice,
-                             stream);
-  cuda_cub::synchronize(policy);
-#endif
-  return status;
-}
-#else
 template <class Type>
 THRUST_HOST_FUNCTION cudaError_t
 trivial_copy_to_device(Type *       dst,
@@ -149,8 +176,6 @@ trivial_copy_to_device(Type *       dst,
   cudaStreamSynchronize(stream);
   return status;
 }
-#endif
-
 
 template <class Policy, class Type>
 __host__ __device__ cudaError_t
@@ -173,59 +198,101 @@ trivial_copy_device_to_device(Policy &    policy,
   return status;
 }
 
-
 inline void __host__ __device__
 terminate()
 {
-#ifdef __CUDA_ARCH__
-  asm("trap;");
-#else
-  std::terminate();
-#endif
+  NV_IF_TARGET(NV_IS_HOST, (std::terminate();), (asm("trap;");));
 }
 
 __host__  __device__
 inline void throw_on_error(cudaError_t status)
 {
+  // Clear the global CUDA error state which may have been set by the last
+  // call. Otherwise, errors may "leak" to unrelated kernel launches.
+#ifdef THRUST_RDC_ENABLED
+  cudaGetLastError();
+#else
+  NV_IF_TARGET(NV_IS_HOST, (cudaGetLastError();));
+#endif
+
   if (cudaSuccess != status)
   {
-#if !defined(__CUDA_ARCH__)
-    throw thrust::system_error(status, thrust::cuda_category());
-#else
-#if __THRUST_HAS_CUDART__
-    printf("Thrust CUDA backend error: %s\n",
-           cudaGetErrorString(status));
+
+    // Can't use #if inside NV_IF_TARGET, use a temp macro to hoist the device
+    // instructions out of the target logic.
+#ifdef THRUST_RDC_ENABLED
+
+#define THRUST_TEMP_DEVICE_CODE \
+  printf("Thrust CUDA backend error: %s: %s\n", \
+         cudaGetErrorName(status), \
+         cudaGetErrorString(status))
+
 #else
-    printf("Thrust CUDA backend error: %d\n",
-           static_cast<int>(status));
-#endif
-    cuda_cub::terminate();
+
+#define THRUST_TEMP_DEVICE_CODE \
+  printf("Thrust CUDA backend error: %d\n", \
+         static_cast<int>(status))
+
 #endif
+
+    NV_IF_TARGET(NV_IS_HOST, (
+      throw thrust::system_error(status, thrust::cuda_category());
+    ), (
+      THRUST_TEMP_DEVICE_CODE;
+      cuda_cub::terminate();
+    ));
+
+#undef THRUST_TEMP_DEVICE_CODE
+
   }
 }
 
-__host__ __device__ 
+__host__ __device__
 inline void throw_on_error(cudaError_t status, char const *msg)
 {
+  // Clear the global CUDA error state which may have been set by the last
+  // call. Otherwise, errors may "leak" to unrelated kernel launches.
+#ifdef THRUST_RDC_ENABLED
+  cudaGetLastError();
+#else
+  NV_IF_TARGET(NV_IS_HOST, (cudaGetLastError();));
+#endif
+
   if (cudaSuccess != status)
   {
-#if !defined(__CUDA_ARCH__)
-    throw thrust::system_error(status, thrust::cuda_category(), msg);
-#else
-#if __THRUST_HAS_CUDART__
-    printf("Thrust CUDA backend error: %s: %s\n",
-           cudaGetErrorString(status),
-           msg);
+    // Can't use #if inside NV_IF_TARGET, use a temp macro to hoist the device
+    // instructions out of the target logic.
+#ifdef THRUST_RDC_ENABLED
+
+#define THRUST_TEMP_DEVICE_CODE \
+  printf("Thrust CUDA backend error: %s: %s: %s\n", \
+         cudaGetErrorName(status), \
+         cudaGetErrorString(status),\
+         msg)
+
 #else
-    printf("Thrust CUDA backend error: %d: %s \n",
-           static_cast<int>(status),
-           msg);
-#endif
-    cuda_cub::terminate();
+
+#define THRUST_TEMP_DEVICE_CODE \
+  printf("Thrust CUDA backend error: %d: %s\n", \
+         static_cast<int>(status),              \
+         msg)
+
 #endif
+
+    NV_IF_TARGET(NV_IS_HOST, (
+      throw thrust::system_error(status, thrust::cuda_category(), msg);
+    ), (
+      THRUST_TEMP_DEVICE_CODE;
+      cuda_cub::terminate();
+    ));
+
+#undef THRUST_TEMP_DEVICE_CODE
+
   }
 }
 
+// FIXME: Move the iterators elsewhere.
+
 template <class ValueType,
           class InputIt,
           class UnaryOp>
@@ -245,6 +312,19 @@ struct transform_input_iterator_t
   transform_input_iterator_t(InputIt input, UnaryOp op)
       : input(input), op(op) {}
 
+#if THRUST_CPP_DIALECT >= 2011
+  transform_input_iterator_t(const self_t &) = default;
+#endif
+
+  // UnaryOp might not be copy assignable, such as when it is a lambda.  Define
+  // an explicit copy assignment operator that doesn't try to assign it.
+  __host__ __device__ 
+  self_t& operator=(const self_t& o)
+  {
+    input = o.input;
+    return *this;
+  }
+
   /// Postfix increment
   __host__ __device__ __forceinline__ self_t operator++(int)
   {
@@ -311,14 +391,6 @@ struct transform_input_iterator_t
     return op(input[n]);
   }
 
-#if 0
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &op(*input_itr);
-    }
-#endif
-
   /// Equal to
   __host__ __device__ __forceinline__ bool operator==(const self_t &rhs) const
   {
@@ -330,14 +402,6 @@ struct transform_input_iterator_t
   {
     return (input != rhs.input);
   }
-
-#if 0
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self& itr)
-    {
-        return os;
-    }
-#endif
 };    // struct transform_input_iterarot_t
 
 template <class ValueType,
@@ -363,6 +427,20 @@ struct transform_pair_of_input_iterators_t
                                       BinaryOp op_)
       : input1(input1_), input2(input2_), op(op_) {}
 
+#if THRUST_CPP_DIALECT >= 2011
+  transform_pair_of_input_iterators_t(const self_t &) = default;
+#endif
+
+  // BinaryOp might not be copy assignable, such as when it is a lambda.
+  // Define an explicit copy assignment operator that doesn't try to assign it.
+  __host__ __device__
+  self_t& operator=(const self_t& o)
+  {
+    input1 = o.input1;
+    input2 = o.input2;
+    return *this;
+  }
+
   /// Postfix increment
   __host__ __device__ __forceinline__ self_t operator++(int)
   {
@@ -445,121 +523,6 @@ struct transform_pair_of_input_iterators_t
 
 };    // struct transform_pair_of_input_iterators_t
 
-template <class ValueType,
-          class InputIt1,
-          class InputIt2,
-          class InputIt3,
-          class TransformOp>
-struct transform_triple_of_input_iterators_t
-{
-  typedef transform_triple_of_input_iterators_t               self_t;
-  typedef typename iterator_traits<InputIt1>::difference_type difference_type;
-  typedef ValueType                                           value_type;
-  typedef value_type *                                        pointer;
-  typedef value_type                                          reference;
-  typedef std::random_access_iterator_tag                     iterator_category;
-
-  InputIt1            input1;
-  InputIt2            input2;
-  InputIt3            input3;
-  mutable TransformOp op;
-
-  __host__ __device__ __forceinline__
-  transform_triple_of_input_iterators_t(InputIt1    input1_,
-                                        InputIt2    input2_,
-                                        InputIt3    input3_,
-                                        TransformOp op_)
-      : input1(input1_), input2(input2_), input3(input3_), op(op_) {}
-
-  /// Postfix increment
-  __host__ __device__ __forceinline__ self_t operator++(int)
-  {
-    self_t retval = *this;
-    ++input1;
-    ++input2;
-    ++input3;
-    return retval;
-  }
-
-  /// Prefix increment
-  __host__ __device__ __forceinline__ self_t operator++()
-  {
-    ++input1;
-    ++input2;
-    ++input3;
-    return *this;
-  }
-
-  /// Indirection
-  __host__ __device__ __forceinline__ reference operator*() const
-  {
-    return op(*input1, *input2, *input3);
-  }
-  /// Indirection
-  __host__ __device__ __forceinline__ reference operator*()
-  {
-    return op(*input1, *input2, *input3);
-  }
-
-  /// Addition
-  __host__ __device__ __forceinline__ self_t operator+(difference_type n) const
-  {
-    return self_t(input1 + n, input2 + n, input3 + n, op);
-  }
-
-  /// Addition assignment
-  __host__ __device__ __forceinline__ self_t &operator+=(difference_type n)
-  {
-    input1 += n;
-    input2 += n;
-    input3 += n;
-    return *this;
-  }
-
-  /// Subtraction
-  __host__ __device__ __forceinline__ self_t operator-(difference_type n) const
-  {
-    return self_t(input1 - n, input2 - n, input3 - n, op);
-  }
-
-  /// Subtraction assignment
-  __host__ __device__ __forceinline__ self_t &operator-=(difference_type n)
-  {
-    input1 -= n;
-    input2 -= n;
-    input3 -= n;
-    return *this;
-  }
-
-  /// Distance
-  __host__ __device__ __forceinline__ difference_type operator-(self_t other) const
-  {
-    return input1 - other.input1;
-  }
-
-  /// Array subscript
-  __host__ __device__ __forceinline__ reference operator[](difference_type n) const
-  {
-    return op(input1[n], input2[n], input3[n]);
-  }
-
-  /// Equal to
-  __host__ __device__ __forceinline__ bool operator==(const self_t &rhs) const
-  {
-    return (input1 == rhs.input1) &&
-           (input2 == rhs.input2) &&
-           (input3 == rhs.input3);
-  }
-
-  /// Not equal to
-  __host__ __device__ __forceinline__ bool operator!=(const self_t &rhs) const
-  {
-    return (input1 != rhs.input1) ||
-           (input2 != rhs.input2) ||
-           (input3 != rhs.input3);
-  }
-
-};    // struct trasnform_triple_of_input_iterators_t
 
 struct identity
 {
@@ -578,208 +541,6 @@ struct identity
   }
 };
 
-template <class ValueType,
-          class OutputIt,
-          class TransformOp = identity>
-struct transform_output_iterator_t
-{
-  struct proxy_reference
-  {
-  private:
-    OutputIt    output;
-    TransformOp op;
-
-  public:
-    __host__ __device__
-    proxy_reference(OutputIt const &output_, TransformOp op_)
-        : output(output_), op(op_) {}
-
-    proxy_reference __host__ __device__
-    operator=(ValueType const &x)
-    {
-      *output = op(x);
-      return *this;
-    }
-  };
-
-  typedef transform_output_iterator_t                         self_t;
-  typedef typename iterator_traits<OutputIt>::difference_type difference_type;
-  typedef void                                                value_type;
-  typedef proxy_reference                                     reference;
-  typedef std::output_iterator_tag                            iterator_category;
-
-  OutputIt    output;
-  TransformOp op;
-
-  __host__ __device__ __forceinline__
-  transform_output_iterator_t(OutputIt output)
-      : output(output) {}
-
-  __host__ __device__ __forceinline__
-  transform_output_iterator_t(OutputIt output, TransformOp op)
-      : output(output), op(op) {}
-
-  /// Postfix increment
-  __host__ __device__ __forceinline__ self_t operator++(int)
-  {
-    self_t retval = *this;
-    ++output;
-    return retval;
-  }
-
-  /// Prefix increment
-  __host__ __device__ __forceinline__ self_t operator++()
-  {
-    ++output;
-    return *this;
-  }
-
-  /// Indirection
-  __host__ __device__ __forceinline__ reference operator*() const
-  {
-    return proxy_reference(output, op);
-  }
-  /// Indirection
-  __host__ __device__ __forceinline__ reference operator*()
-  {
-    return proxy_reference(output, op);
-  }
-
-  /// Addition
-  __host__ __device__ __forceinline__ self_t operator+(difference_type n) const
-  {
-    return self_t(output + n, op);
-  }
-
-  /// Addition assignment
-  __host__ __device__ __forceinline__ self_t &operator+=(difference_type n)
-  {
-    output += n;
-    return *this;
-  }
-
-  /// Subtraction
-  __host__ __device__ __forceinline__ self_t operator-(difference_type n) const
-  {
-    return self_t(output - n, op);
-  }
-
-  /// Subtraction assignment
-  __host__ __device__ __forceinline__ self_t &operator-=(difference_type n)
-  {
-    output -= n;
-    return *this;
-  }
-
-  /// Distance
-  __host__ __device__ __forceinline__ difference_type operator-(self_t other) const
-  {
-    return output - other.output;
-  }
-
-  /// Array subscript
-  __host__ __device__ __forceinline__ reference operator[](difference_type n) const
-  {
-    return *(output + n);
-  }
-
-  /// Equal to
-  __host__ __device__ __forceinline__ bool operator==(const self_t &rhs) const
-  {
-    return (output == rhs.output);
-  }
-
-  /// Not equal to
-  __host__ __device__ __forceinline__ bool operator!=(const self_t &rhs) const
-  {
-    return (output != rhs.output);
-  }
-};    // struct transform_output_iterator_
-
-template <class T, T VALUE>
-struct static_integer_iterator
-{
-  typedef static_integer_iterator         self_t;
-  typedef int                             difference_type;
-  typedef T                               value_type;
-  typedef T                               reference;
-  typedef std::random_access_iterator_tag iterator_category;
-
-  __host__ __device__ __forceinline__
-  static_integer_iterator() {}
-
-  /// Postfix increment
-  __host__ __device__ __forceinline__ self_t operator++(int)
-  {
-    return *this;
-  }
-
-  /// Prefix increment
-  __host__ __device__ __forceinline__ self_t operator++()
-  {
-    return *this;
-  }
-
-  /// Indirection
-  __host__ __device__ __forceinline__ reference operator*() const
-  {
-    return VALUE;
-  }
-  /// Indirection
-  __host__ __device__ __forceinline__ reference operator*()
-  {
-    return VALUE;
-  }
-
-  /// Addition
-  __host__ __device__ __forceinline__ self_t operator+(difference_type ) const
-  {
-    return self_t();
-  }
-
-  /// Addition assignment
-  __host__ __device__ __forceinline__ self_t &operator+=(difference_type )
-  {
-    return *this;
-  }
-
-  /// Subtraction
-  __host__ __device__ __forceinline__ self_t operator-(difference_type ) const
-  {
-    return self_t();
-  }
-
-  /// Subtraction assignment
-  __host__ __device__ __forceinline__ self_t &operator-=(difference_type )
-  {
-    return *this;
-  }
-
-  /// Distance
-  __host__ __device__ __forceinline__ difference_type operator-(self_t ) const
-  {
-    return 0;
-  }
-
-  /// Array subscript
-  __host__ __device__ __forceinline__ reference operator[](difference_type ) const
-  {
-    return VALUE;
-  }
-
-  /// Equal to
-  __host__ __device__ __forceinline__ bool operator==(const self_t &) const
-  {
-    return true;
-  }
-
-  /// Not equal to
-  __host__ __device__ __forceinline__ bool operator!=(const self_t &) const
-  {
-    return false;
-  }
-
-};    // struct static_bool_iterator
 
 template <class T>
 struct counting_iterator_t
@@ -875,7 +636,6 @@ struct counting_iterator_t
 
 };    // struct count_iterator_t
 
-
 }    // cuda_
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
diff --git a/thrust/system/cuda/detail/vector.inl b/thrust/system/cuda/detail/vector.inl
deleted file mode 100644
index 81941d62f..000000000
--- a/thrust/system/cuda/detail/vector.inl
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in ccudaliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/vector.h>
-
-namespace thrust
-{
-namespace cuda_cub
-{
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector()
-      : super_t()
-{}
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector(size_type n)
-      : super_t(n)
-{}
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector(size_type n, const value_type &value)
-      : super_t(n,value)
-{}
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector(const vector &x)
-      : super_t(x)
-{}
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator>
-      ::vector(const thrust::detail::vector_base<OtherT,OtherAllocator> &x)
-        : super_t(x)
-{}
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator>
-      ::vector(const std::vector<OtherT,OtherAllocator> &x)
-        : super_t(x)
-{}
-
-template<typename T, typename Allocator>
-  template<typename InputIterator>
-    vector<T,Allocator>
-      ::vector(InputIterator first, InputIterator last)
-        : super_t(first,last)
-{}
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator> &
-      vector<T,Allocator>
-        ::operator=(const std::vector<OtherT,OtherAllocator> &x)
-{
-  super_t::operator=(x);
-  return *this;
-}
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator> &
-      vector<T,Allocator>
-        ::operator=(const thrust::detail::vector_base<OtherT,OtherAllocator> &x)
-{
-  super_t::operator=(x);
-  return *this;
-}
-      
-} // end cuda_cub
-} // end thrust
-
diff --git a/thrust/system/cuda/error.h b/thrust/system/cuda/error.h
index dcbadd855..b180f8347 100644
--- a/thrust/system/cuda/error.h
+++ b/thrust/system/cuda/error.h
@@ -26,8 +26,7 @@
 #include <thrust/system/error_code.h>
 #include <thrust/system/cuda/detail/guarded_driver_types.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -56,7 +55,6 @@ enum errc_t
   memory_allocation                  = cudaErrorMemoryAllocation,
   initialization_error               = cudaErrorInitializationError,
   launch_failure                     = cudaErrorLaunchFailure,
-  prior_launch_failure               = cudaErrorPriorLaunchFailure,
   launch_timeout                     = cudaErrorLaunchTimeout,
   launch_out_of_resources            = cudaErrorLaunchOutOfResources,
   invalid_device_function            = cudaErrorInvalidDeviceFunction,
@@ -67,23 +65,14 @@ enum errc_t
   invalid_symbol                     = cudaErrorInvalidSymbol,
   map_buffer_object_failed           = cudaErrorMapBufferObjectFailed,
   unmap_buffer_object_failed         = cudaErrorUnmapBufferObjectFailed,
-  invalid_host_pointer               = cudaErrorInvalidHostPointer,
-  invalid_device_pointer             = cudaErrorInvalidDevicePointer,
   invalid_texture                    = cudaErrorInvalidTexture,
   invalid_texture_binding            = cudaErrorInvalidTextureBinding,
   invalid_channel_descriptor         = cudaErrorInvalidChannelDescriptor,
   invalid_memcpy_direction           = cudaErrorInvalidMemcpyDirection,
-  address_of_constant_error          = cudaErrorAddressOfConstant,
-  texture_fetch_failed               = cudaErrorTextureFetchFailed,
-  texture_not_bound                  = cudaErrorTextureNotBound,
-  synchronization_error              = cudaErrorSynchronizationError,
   invalid_filter_setting             = cudaErrorInvalidFilterSetting,
   invalid_norm_setting               = cudaErrorInvalidNormSetting,
-  mixed_device_execution             = cudaErrorMixedDeviceExecution,
   cuda_runtime_unloading             = cudaErrorCudartUnloading,
   unknown                            = cudaErrorUnknown,
-  not_yet_implemented                = cudaErrorNotYetImplemented,
-  memory_value_too_large             = cudaErrorMemoryValueTooLarge,
   invalid_resource_handle            = cudaErrorInvalidResourceHandle,
   not_ready                          = cudaErrorNotReady,
   insufficient_driver                = cudaErrorInsufficientDriver,
@@ -177,7 +166,7 @@ namespace errc = system::cuda::errc;
 
 using system::cuda_category;
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/cuda/detail/error.inl>
 
diff --git a/thrust/system/cuda/execution_policy.h b/thrust/system/cuda/execution_policy.h
index 39bbb7927..c171ac3d9 100644
--- a/thrust/system/cuda/execution_policy.h
+++ b/thrust/system/cuda/execution_policy.h
@@ -26,59 +26,6 @@
  ******************************************************************************/
 #pragma once
 
-// histogram
-// sort (radix-sort, merge-sort)
-
 #include <thrust/detail/config.h>
 #include <thrust/system/cuda/detail/execution_policy.h>
 #include <thrust/system/cuda/detail/par.h>
-
-// pass
-// ----------------
-#include <thrust/system/cuda/detail/adjacent_difference.h>
-#include <thrust/system/cuda/detail/copy.h>
-#include <thrust/system/cuda/detail/copy_if.h>
-#include <thrust/system/cuda/detail/count.h>
-#include <thrust/system/cuda/detail/equal.h>
-#include <thrust/system/cuda/detail/extrema.h>
-#include <thrust/system/cuda/detail/fill.h>
-#include <thrust/system/cuda/detail/find.h>
-#include <thrust/system/cuda/detail/for_each.h>
-#include <thrust/system/cuda/detail/gather.h>
-#include <thrust/system/cuda/detail/generate.h>
-#include <thrust/system/cuda/detail/inner_product.h>
-#include <thrust/system/cuda/detail/mismatch.h>
-#include <thrust/system/cuda/detail/partition.h>
-#include <thrust/system/cuda/detail/reduce_by_key.h>
-#include <thrust/system/cuda/detail/remove.h>
-#include <thrust/system/cuda/detail/replace.h>
-#include <thrust/system/cuda/detail/reverse.h>
-#include <thrust/system/cuda/detail/scatter.h>
-#include <thrust/system/cuda/detail/swap_ranges.h>
-#include <thrust/system/cuda/detail/tabulate.h>
-#include <thrust/system/cuda/detail/transform.h>
-#include <thrust/system/cuda/detail/transform_reduce.h>
-#include <thrust/system/cuda/detail/transform_scan.h>
-#include <thrust/system/cuda/detail/uninitialized_copy.h>
-#include <thrust/system/cuda/detail/uninitialized_fill.h>
-#include <thrust/system/cuda/detail/unique.h>
-#include <thrust/system/cuda/detail/unique_by_key.h>
-
-// fail
-// ----------------
-// fails with mixed types
-#include <thrust/system/cuda/detail/reduce.h>
-
-// mixed types are not compiling, commented in testing/scan.cu
-#include <thrust/system/cuda/detail/scan.h>
-
-// stubs passed
-// ----------------
-#include <thrust/system/cuda/detail/binary_search.h>
-#include <thrust/system/cuda/detail/merge.h>
-#include <thrust/system/cuda/detail/scan_by_key.h>
-#include <thrust/system/cuda/detail/set_operations.h>
-#include <thrust/system/cuda/detail/sort.h>
-
-// work in progress
-
diff --git a/thrust/system/cuda/experimental/pinned_allocator.h b/thrust/system/cuda/experimental/pinned_allocator.h
deleted file mode 100644
index e03a0d921..000000000
--- a/thrust/system/cuda/experimental/pinned_allocator.h
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-/*! \file thrust/system/cuda/experimental/pinned_allocator.h
- *  \brief An allocator which creates new elements in "pinned" memory with \p cudaMallocHost
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
-#include <stdexcept>
-#include <limits>
-#include <string>
-#include <thrust/system/system_error.h>
-#include <thrust/system/cuda/error.h>
-
-namespace thrust
-{
-
-namespace system
-{
-
-namespace cuda
-{
-
-namespace experimental
-{
-
-/*! \addtogroup memory_management_classes
- *  \ingroup memory_management
- *  \{
- */
-
-/*! \p pinned_allocator is a CUDA-specific host memory allocator
- *  that employs \c cudaMallocHost for allocation.
- *
- *  \see http://www.sgi.com/tech/stl/Allocators.html
- */
-template<typename T> class pinned_allocator;
-
-template<>
-  class pinned_allocator<void>
-{
-  public:
-    typedef void           value_type;
-    typedef void       *   pointer;
-    typedef const void *   const_pointer;
-    typedef std::size_t    size_type;
-    typedef std::ptrdiff_t difference_type;
-
-    // convert a pinned_allocator<void> to pinned_allocator<U>
-    template<typename U>
-      struct rebind
-    {
-      typedef pinned_allocator<U> other;
-    }; // end rebind
-}; // end pinned_allocator
-
-
-template<typename T>
-  class pinned_allocator
-{
-  public:
-    //! \{
-    typedef T              value_type;
-    typedef T*             pointer;
-    typedef const T*       const_pointer;
-    typedef T&             reference;
-    typedef const T&       const_reference;
-    typedef std::size_t    size_type;
-    typedef std::ptrdiff_t difference_type;
-    //! \}
-
-    // convert a pinned_allocator<T> to pinned_allocator<U>
-    template<typename U>
-      struct rebind
-    {
-      typedef pinned_allocator<U> other;
-    }; // end rebind
-
-    /*! \p pinned_allocator's null constructor does nothing.
-     */
-    __host__ __device__
-    inline pinned_allocator() {}
-
-    /*! \p pinned_allocator's null destructor does nothing.
-     */
-    __host__ __device__
-    inline ~pinned_allocator() {}
-
-    /*! \p pinned_allocator's copy constructor does nothing.
-     */
-    __host__ __device__
-    inline pinned_allocator(pinned_allocator const &) {}
-
-    /*! This version of \p pinned_allocator's copy constructor
-     *  is templated on the \c value_type of the \p pinned_allocator
-     *  to copy from.  It is provided merely for convenience; it
-     *  does nothing.
-     */
-    template<typename U>
-    __host__ __device__
-    inline pinned_allocator(pinned_allocator<U> const &) {}
-
-    /*! This method returns the address of a \c reference of
-     *  interest.
-     *
-     *  \p r The \c reference of interest.
-     *  \return \c r's address.
-     */
-    __host__ __device__
-    inline pointer address(reference r) { return &r; }
-
-    /*! This method returns the address of a \c const_reference
-     *  of interest.
-     *
-     *  \p r The \c const_reference of interest.
-     *  \return \c r's address.
-     */
-    __host__ __device__
-    inline const_pointer address(const_reference r) { return &r; }
-
-    /*! This method allocates storage for objects in pinned host
-     *  memory.
-     *
-     *  \p cnt The number of objects to allocate.
-     *  \return a \c pointer to the newly allocated objects.
-     *  \note This method does not invoke \p value_type's constructor.
-     *        It is the responsibility of the caller to initialize the
-     *        objects at the returned \c pointer. 
-     */
-    __host__
-    inline pointer allocate(size_type cnt,
-                            const_pointer = 0)
-    {
-      if(cnt > this->max_size())
-      {
-        throw std::bad_alloc();
-      } // end if
-
-      pointer result(0);
-      cudaError_t error = cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type));
-
-      if(error)
-      {
-        throw std::bad_alloc();
-      } // end if
-
-      return result;
-    } // end allocate()
-
-    /*! This method deallocates pinned host memory previously allocated
-     *  with this \c pinned_allocator.
-     *
-     *  \p p A \c pointer to the previously allocated memory.
-     *  \p cnt The number of objects previously allocated at
-     *         \p p.
-     *  \note This method does not invoke \p value_type's destructor.
-     *        It is the responsibility of the caller to destroy
-     *        the objects stored at \p p.
-     */
-    __host__
-    inline void deallocate(pointer p, size_type /*cnt*/)
-    {
-      cudaError_t error = cudaFreeHost(p);
-      
-      if(error)
-      {
-        throw thrust::system_error(error, thrust::cuda_category());
-      } // end if
-    } // end deallocate()
-
-    /*! This method returns the maximum size of the \c cnt parameter
-     *  accepted by the \p allocate() method.
-     *
-     *  \return The maximum number of objects that may be allocated
-     *          by a single call to \p allocate().
-     */
-    inline size_type max_size() const
-    {
-      return (std::numeric_limits<size_type>::max)() / sizeof(T);
-    } // end max_size()
-
-    /*! This method tests this \p pinned_allocator for equality to
-     *  another.
-     *
-     *  \param x The other \p pinned_allocator of interest.
-     *  \return This method always returns \c true.
-     */
-    __host__ __device__
-    inline bool operator==(pinned_allocator const& x) const { return true; }
-
-    /*! This method tests this \p pinned_allocator for inequality
-     *  to another.
-     *
-     *  \param x The other \p pinned_allocator of interest.
-     *  \return This method always returns \c false.
-     */
-    __host__ __device__
-    inline bool operator!=(pinned_allocator const &x) const { return !operator==(x); }
-}; // end pinned_allocator
-
-/*! \}
- */
-
-} // end experimental
-
-} // end cuda
-
-} // end system
-
-// alias cuda's members at top-level
-namespace cuda
-{
-
-namespace experimental
-{
-
-using thrust::system::cuda::experimental::pinned_allocator;
-
-} // end experimental
-
-} // end cuda
-
-} // end thrust
-
diff --git a/thrust/system/cuda/future.h b/thrust/system/cuda/future.h
index 4709f16a2..79bfc9134 100644
--- a/thrust/system/cuda/future.h
+++ b/thrust/system/cuda/future.h
@@ -6,15 +6,14 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/cpp11_required.h>
-#include <thrust/detail/modern_gcc_required.h>
+#include <thrust/detail/cpp14_required.h>
 
-#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#if THRUST_CPP_DIALECT >= 2014
 
 #include <thrust/system/cuda/pointer.h>
 #include <thrust/system/cuda/detail/execution_policy.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace system { namespace cuda
 {
@@ -66,9 +65,9 @@ unique_eager_future_type(
   thrust::cuda::execution_policy<DerivedPolicy> const&
 ) noexcept;
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #include <thrust/system/cuda/detail/future.inl>
 
-#endif
+#endif // C++14
 
diff --git a/thrust/system/cuda/memory.h b/thrust/system/cuda/memory.h
index 2e9c6080a..eb8020adb 100644
--- a/thrust/system/cuda/memory.h
+++ b/thrust/system/cuda/memory.h
@@ -2,7 +2,7 @@
  *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in ccudaliance with the License.
+ *  you may not use this file except in compliance with the License.
  *  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
@@ -27,8 +27,9 @@
 #include <thrust/mr/allocator.h>
 #include <ostream>
 
-THRUST_BEGIN_NS
-namespace cuda_cub {
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub
+{
 
 /*! Allocates an area of memory available to Thrust's <tt>cuda</tt> system.
  *  \param n Number of bytes to allocate.
@@ -63,80 +64,46 @@ inline __host__ __device__ pointer<T> malloc(std::size_t n);
  */
 inline __host__ __device__ void free(pointer<void> ptr);
 
-// XXX upon c++11
-// template<typename T>
-// using allocator = thrust::mr::stateless_resource_allocator<T, memory_resource>;
-
-/*! \p cuda::allocator is the default allocator used by the \p cuda system's containers such as
- *  <tt>cuda::vector</tt> if no user-specified allocator is provided. \p cuda::allocator allocates
- *  (deallocates) storage with \p cuda::malloc (\p cuda::free).
+/*! \p cuda::allocator is the default allocator used by the \p cuda system's
+ *  containers such as <tt>cuda::vector</tt> if no user-specified allocator is
+ *  provided. \p cuda::allocator allocates (deallocates) storage with \p
+ *  cuda::malloc (\p cuda::free).
  */
-template <typename T>
-struct allocator
-    : thrust::mr::stateless_resource_allocator<
-        T,
-        system::cuda::memory_resource
-    >
-{
-private:
-    typedef thrust::mr::stateless_resource_allocator<
-        T,
-        system::cuda::memory_resource
-    > base;
-
-public:
-  /*! The \p rebind metafunction provides the type of an \p allocator
-   *  instantiated with another type.
-   *
-   *  \tparam U The other type to use for instantiation.
-   */
-  template <typename U>
-  struct rebind
-  {
-    /*! The typedef \p other gives the type of the rebound \p allocator.
-     */
-    typedef allocator<U> other;
-  };
-
-  /*! No-argument constructor has no effect.
-   */
-  __host__ __device__
-  inline allocator() {}
-
-  /*! Copy constructor has no effect.
-   */
-  __host__ __device__
- inline allocator(const allocator & other) : base(other) {}
+template<typename T>
+using allocator = thrust::mr::stateless_resource_allocator<
+  T, thrust::system::cuda::memory_resource
+>;
 
-  /*! Constructor from other \p allocator has no effect.
-   */
-  template <typename U>
-  __host__ __device__
-  inline allocator(const allocator<U> & other) : base(other) {}
-
-  /*! Destructor has no effect.
-   */
-  __host__ __device__
-  inline ~allocator() {}
-};    // struct allocator
+/*! \p cuda::universal_allocator allocates memory that can be used by the \p cuda
+ *  system and host systems.
+ */
+template<typename T>
+using universal_allocator = thrust::mr::stateless_resource_allocator<
+  T, thrust::system::cuda::universal_memory_resource
+>;
 
-}    // namespace cuda_cub
+} // namespace cuda_cub
 
-namespace system {
-namespace cuda {
+namespace system { namespace cuda
+{
 using thrust::cuda_cub::malloc;
 using thrust::cuda_cub::free;
 using thrust::cuda_cub::allocator;
-} // namespace cuda
-} // namespace system
+using thrust::cuda_cub::universal_allocator;
+}} // namespace system::cuda
 
-namespace cuda {
+/*! \namespace thrust::cuda
+ *  \brief \p thrust::cuda is a top-level alias for \p thrust::system::cuda.
+ */
+namespace cuda
+{
 using thrust::cuda_cub::malloc;
 using thrust::cuda_cub::free;
 using thrust::cuda_cub::allocator;
-}    // end cuda
+using thrust::cuda_cub::universal_allocator;
+} // namespace cuda
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #include <thrust/system/cuda/detail/memory.inl>
 
diff --git a/thrust/system/cuda/memory_resource.h b/thrust/system/cuda/memory_resource.h
index 4c78ba213..4bf534e40 100644
--- a/thrust/system/cuda/memory_resource.h
+++ b/thrust/system/cuda/memory_resource.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2018 NVIDIA Corporation
+ *  Copyright 2018-2020 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -20,6 +20,8 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/mr/memory_resource.h>
 #include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
 #include <thrust/system/cuda/pointer.h>
@@ -27,9 +29,9 @@
 #include <thrust/system/cuda/error.h>
 #include <thrust/system/cuda/detail/util.h>
 
-#include <thrust/memory/detail/host_system_resource.h>
+#include <thrust/mr/host_memory_resource.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -40,14 +42,14 @@ namespace cuda
 namespace detail
 {
 
-    typedef cudaError_t (*allocation_fn)(void **, std::size_t);
-    typedef cudaError_t (*deallocation_fn)(void *);
+    typedef cudaError_t (CUDARTAPI *allocation_fn)(void **, std::size_t);
+    typedef cudaError_t (CUDARTAPI *deallocation_fn)(void *);
 
     template<allocation_fn Alloc, deallocation_fn Dealloc, typename Pointer>
-    class cuda_memory_resource THRUST_FINAL : public mr::memory_resource<Pointer>
+    class cuda_memory_resource final : public mr::memory_resource<Pointer>
     {
     public:
-        Pointer do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) THRUST_OVERRIDE
+        Pointer do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
         {
             (void)alignment;
 
@@ -56,13 +58,14 @@ namespace detail
 
             if (status != cudaSuccess)
             {
+                cudaGetLastError(); // Clear the CUDA global error state.
                 throw thrust::system::detail::bad_alloc(thrust::cuda_category().message(status).c_str());
             }
 
             return Pointer(ret);
         }
 
-        void do_deallocate(Pointer p, std::size_t bytes, std::size_t alignment) THRUST_OVERRIDE
+        void do_deallocate(Pointer p, std::size_t bytes, std::size_t alignment) override
         {
             (void)bytes;
             (void)alignment;
@@ -76,7 +79,7 @@ namespace detail
         }
     };
 
-    inline cudaError_t cudaMallocManaged(void ** ptr, std::size_t bytes)
+    inline cudaError_t CUDARTAPI cudaMallocManaged(void ** ptr, std::size_t bytes)
     {
         return ::cudaMallocManaged(ptr, bytes, cudaMemAttachGlobal);
     }
@@ -85,24 +88,39 @@ namespace detail
         thrust::cuda::pointer<void> >
         device_memory_resource;
     typedef detail::cuda_memory_resource<detail::cudaMallocManaged, cudaFree,
-        thrust::cuda::pointer<void> >
+        thrust::cuda::universal_pointer<void> >
         managed_memory_resource;
     typedef detail::cuda_memory_resource<cudaMallocHost, cudaFreeHost,
-        thrust::host_memory_resource::pointer>
+        thrust::cuda::universal_pointer<void> >
         pinned_memory_resource;
 
 } // end detail
 //! \endcond
 
-/*! The memory resource for the CUDA system. Uses <tt>cudaMalloc</tt> and wraps the result with \p cuda::pointer. */
+/*! The memory resource for the CUDA system. Uses <tt>cudaMalloc</tt> and wraps
+ *  the result with \p cuda::pointer.
+ */
 typedef detail::device_memory_resource memory_resource;
-/*! The universal memory resource for the CUDA system. Uses <tt>cudaMallocManaged</tt> and wraps the result with \p cuda::pointer. */
+/*! The universal memory resource for the CUDA system. Uses
+ *  <tt>cudaMallocManaged</tt> and wraps the result with
+ *  \p cuda::universal_pointer.
+ */
 typedef detail::managed_memory_resource universal_memory_resource;
-/*! The host pinned memory resource for the CUDA system. Uses <tt>cudaMallocHost</tt> and wraps the result with \p cuda::pointer. */
+/*! The host pinned memory resource for the CUDA system. Uses
+ *  <tt>cudaMallocHost</tt> and wraps the result with \p
+ *  cuda::universal_pointer.
+ */
 typedef detail::pinned_memory_resource universal_host_pinned_memory_resource;
 
 } // end cuda
 } // end system
 
-THRUST_END_NS
+namespace cuda
+{
+using thrust::system::cuda::memory_resource;
+using thrust::system::cuda::universal_memory_resource;
+using thrust::system::cuda::universal_host_pinned_memory_resource;
+}
+
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cuda/pointer.h b/thrust/system/cuda/pointer.h
index f198385ce..ace77fbae 100644
--- a/thrust/system/cuda/pointer.h
+++ b/thrust/system/cuda/pointer.h
@@ -1,8 +1,8 @@
 /*
- *  Copyright 2008-2018 NVIDIA Corporation
+ *  Copyright 2008-2020 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in ccudaliance with the License.
+ *  you may not use this file except in compliance with the License.
  *  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
@@ -14,76 +14,37 @@
  *  limitations under the License.
  */
 
+/*! \file thrust/system/cuda/memory.h
+ *  \brief Managing memory associated with Thrust's Standard C++ system.
+ */
+
 #pragma once
 
 #include <thrust/detail/config.h>
+#include <type_traits>
 #include <thrust/system/cuda/detail/execution_policy.h>
-#include <thrust/detail/type_traits.h>
 #include <thrust/detail/pointer.h>
 #include <thrust/detail/reference.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub
 {
 
-template <typename>
-class pointer;
-
-} // end cuda_cub
-} // end thrust
-
-
-// specialize thrust::iterator_traits to avoid problems with the name of
-// pointer's constructor shadowing its nested pointer type
-// do this before pointer is defined so the specialization is correctly
-// used inside the definition
-namespace thrust
-{
-
-template <typename Element>
-struct iterator_traits<thrust::cuda_cub::pointer<Element> >
-{
-private:
-  typedef thrust::cuda_cub::pointer<Element> ptr;
-
-public:
-  typedef typename ptr::iterator_category iterator_category;
-  typedef typename ptr::value_type        value_type;
-  typedef typename ptr::difference_type   difference_type;
-  typedef ptr                             pointer;
-  typedef typename ptr::reference         reference;
-};    // end iterator_traits
-
-namespace cuda_cub {
-
-// forward declaration of reference for pointer
-template <typename Element>
-class reference;
-
-// XXX nvcc + msvc have trouble instantiating reference below
-//     this is a workaround
-template <typename Element>
-struct reference_msvc_workaround
-{
-  typedef thrust::cuda_cub::reference<Element> type;
-};    // end reference_msvc_workaround
-
-
-/*! \p pointer stores a pointer to an object allocated in memory available to the cuda system.
- *  This type provides type safety when dispatching standard algorithms on ranges resident
- *  in cuda memory.
+/*! \p cuda::pointer stores a pointer to an object allocated in memory
+ *  accessible by the \p cuda system. This type provides type safety when
+ *  dispatching algorithms on ranges resident in \p cuda memory.
  *
- *  \p pointer has pointer semantics: it may be dereferenced and manipulated with pointer arithmetic.
+ *  \p cuda::pointer has pointer semantics: it may be dereferenced and
+ *  manipulated with pointer arithmetic.
  *
- *  \p pointer can be created with the function \p cuda::malloc, or by explicitly calling its constructor
- *  with a raw pointer.
+ *  \p cuda::pointer can be created with the function \p cuda::malloc, or by
+ *  explicitly calling its constructor with a raw pointer.
  *
- *  The raw pointer encapsulated by a \p pointer may be obtained by eiter its <tt>get</tt> member function
- *  or the \p raw_pointer_cast function.
+ *  The raw pointer encapsulated by a \p cuda::pointer may be obtained by eiter
+ *  its <tt>get</tt> member function or the \p raw_pointer_cast function.
  *
- *  \note \p pointer is not a "smart" pointer; it is the programmer's responsibility to deallocate memory
- *  pointed to by \p pointer.
+ *  \note \p cuda::pointer is not a "smart" pointer; it is the programmer's
+ *        responsibility to deallocate memory pointed to by \p cuda::pointer.
  *
  *  \tparam T specifies the type of the pointee.
  *
@@ -92,198 +53,53 @@ struct reference_msvc_workaround
  *  \see raw_pointer_cast
  */
 template <typename T>
-class pointer
-    : public thrust::pointer<
-          T,
-          thrust::cuda_cub::tag,
-          thrust::cuda_cub::reference<T>,
-          thrust::cuda_cub::pointer<T> >
-{
-
-private:
-  typedef thrust::pointer<
-      T,
-      thrust::cuda_cub::tag,
-      typename reference_msvc_workaround<T>::type,
-      thrust::cuda_cub::pointer<T> >
-      super_t;
-
-public:
-  /*! \p pointer's no-argument constructor initializes its encapsulated pointer to \c 0.
-   */
-  __host__ __device__
-  pointer() : super_t() {}
-
-  #if THRUST_CPP_DIALECT >= 2011
-  // NOTE: This is needed so that Thrust smart pointers can be used in
-  // `std::unique_ptr`.
-  __host__ __device__
-  pointer(decltype(nullptr)) : super_t(nullptr) {}
-  #endif
-
-  /*! This constructor allows construction of a <tt>pointer<const T></tt> from a <tt>T*</tt>.
-   *
-   *  \param ptr A raw pointer to copy from, presumed to point to a location in memory
-   *         accessible by the \p cuda system.
-   *  \tparam OtherT \p OtherT shall be convertible to \p T.
-   */
-  template <typename OtherT>
-  __host__ __device__ explicit pointer(OtherT *ptr) : super_t(ptr)
-  {
-  }
-
-  /*! This constructor allows construction from another pointer-like object with related type.
-   *
-   *  \param other The \p OtherPointer to copy.
-   *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
-   *          to \p thrust::system::cuda::tag and its element type shall be convertible to \p T.
-   */
-  template <typename OtherPointer>
-  __host__ __device__
-  pointer(const OtherPointer &other,
-          typename thrust::detail::enable_if_pointer_is_convertible<
-              OtherPointer,
-              pointer>::type * = 0) : super_t(other)
-  {
-  }
-
-  /*! This constructor allows construction from another pointer-like object with \p void type.
-   *
-   *  \param other The \p OtherPointer to copy.
-   *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
-   *          to \p thrust::system::cuda::tag and its element type shall be \p void.
-   */
-  template <typename OtherPointer>
-  __host__ __device__
-  explicit
-  pointer(const OtherPointer &other,
-          typename thrust::detail::enable_if_void_pointer_is_system_convertible<
-              OtherPointer,
-              pointer>::type * = 0) : super_t(other)
-  {
-  }
-
-  /*! Assignment operator allows assigning from another pointer-like object with related type.
-   *
-   *  \param other The other pointer-like object to assign from.
-   *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
-   *          to \p thrust::system::cuda::tag and its element type shall be convertible to \p T.
-   */
-  template <typename OtherPointer>
-  __host__ __device__
-      typename thrust::detail::enable_if_pointer_is_convertible<
-          OtherPointer,
-          pointer,
-          pointer &>::type
-      operator=(const OtherPointer &other)
-  {
-    return super_t::operator=(other);
-  }
-
-  #if THRUST_CPP_DIALECT >= 2011
-  // NOTE: This is needed so that Thrust smart pointers can be used in
-  // `std::unique_ptr`.
-  __host__ __device__
-  pointer& operator=(decltype(nullptr))
-  {
-    super_t::operator=(nullptr);
-    return *this;
-  }
-  #endif
-};    // struct pointer
-
-/*! \p reference is a wrapped reference to an object stored in memory available to the \p cuda system.
- *  \p reference is the type of the result of dereferencing a \p cuda::pointer.
+using pointer = thrust::pointer<
+  T,
+  thrust::cuda_cub::tag,
+  thrust::tagged_reference<T, thrust::cuda_cub::tag>
+>;
+
+/*! \p cuda::universal_pointer stores a pointer to an object allocated in
+ *  memory accessible by the \p cuda system and host systems.
  *
- *  \tparam T Specifies the type of the referenced object.
+ *  \p cuda::universal_pointer has pointer semantics: it may be dereferenced
+ *  and manipulated with pointer arithmetic.
+ *
+ *  \p cuda::universal_pointer can be created with \p cuda::universal_allocator
+ *  or by explicitly calling its constructor with a raw pointer.
+ *
+ *  The raw pointer encapsulated by a \p cuda::universal_pointer may be
+ *  obtained by eiter its <tt>get</tt> member function or the \p
+ *  raw_pointer_cast function.
+ *
+ *  \note \p cuda::universal_pointer is not a "smart" pointer; it is the
+ *        programmer's responsibility to deallocate memory pointed to by
+ *        \p cuda::universal_pointer.
+ *
+ *  \tparam T specifies the type of the pointee.
+ *
+ *  \see cuda::universal_allocator
+ *  \see raw_pointer_cast
  */
 template <typename T>
-class reference
-    : public thrust::reference<
-          T,
-          thrust::cuda_cub::pointer<T>,
-          thrust::cuda_cub::reference<T> >
-{
-
-private:
-  typedef thrust::reference<
-      T,
-      thrust::cuda_cub::pointer<T>,
-      thrust::cuda_cub::reference<T> >
-      super_t;
-
-public:
-  /*! \cond
-   */
-
-  typedef typename super_t::value_type value_type;
-  typedef typename super_t::pointer    pointer;
-
-  /*! \endcond
-   */
-
-  /*! This constructor initializes this \p reference to refer to an object
-   *  pointed to by the given \p pointer. After this \p reference is constructed,
-   *  it shall refer to the object pointed to by \p ptr.
-   *
-   *  \param ptr A \p pointer to copy from.
-   */
-  __host__ __device__ explicit reference(const pointer &ptr)
-      : super_t(ptr)
-  {
-  }
-
-  /*! This constructor accepts a const reference to another \p reference of related type.
-   *  After this \p reference is constructed, it shall refer to the same object as \p other.
-   *
-   *  \param other A \p reference to copy from.
-   *  \tparam OtherT The element type of the other \p reference.
-   *
-   *  \note This constructor is templated primarily to allow initialization of <tt>reference<const T></tt>
-   *        from <tt>reference<T></tt>.
-   */
-  template <typename OtherT>
-  __host__ __device__
-  reference(const reference<OtherT> &other,
-            typename thrust::detail::enable_if_convertible<
-                typename reference<OtherT>::pointer,
-                pointer>::type * = 0)
-      : super_t(other)
-  {
-  }
-
-  /*! Copy assignment operator copy assigns from another \p reference of related type.
-   *
-   *  \param other The other \p reference to assign from.
-   *  \return <tt>*this</tt>
-   *  \tparam OtherT The element type of the other \p reference.
-   */
-  template <typename OtherT>
-  __host__ __device__
-      reference &
-      operator=(const reference<OtherT> &other);
-
-  /*! Assignment operator assigns from a \p value_type.
-   *
-   *  \param x The \p value_type to assign from.
-   *  \return <tt>*this</tt>
-   */
-  __host__ __device__
-      reference &
-      operator=(const value_type &x);
-};    // struct reference
-
-/*! Exchanges the values of two objects referred to by \p reference.
- *  \p x The first \p reference of interest.
- *  \p y The second \p reference of interest.
+using universal_pointer = thrust::pointer<
+  T,
+  thrust::cuda_cub::tag,
+  typename std::add_lvalue_reference<T>::type
+>;
+
+/*! \p cuda::reference is a wrapped reference to an object stored in memory
+ *  accessible by the \p cuda system. \p cuda::reference is the type of the
+ *  result of dereferencing a \p cuda::pointer.
+ *
+ *  \tparam T Specifies the type of the referenced object.
+ *
+ *  \see cuda::pointer
  */
 template <typename T>
-__host__ __device__ void swap(reference<T> x, reference<T> y);
-
-} // end cuda_cub
-
-namespace system {
+using reference = thrust::tagged_reference<T, thrust::cuda_cub::tag>;
 
+} // namespace cuda_cub
 
 /*! \addtogroup system_backends Systems
  *  \ingroup system
@@ -291,31 +107,31 @@ namespace system {
  */
 
 /*! \namespace thrust::system::cuda
- *  \brief \p thrust::system::cuda is the namespace containing functionality for allocating, manipulating,
- *         and deallocating memory available to Thrust's CUDA backend system.
- *         The identifiers are provided in a separate namespace underneath <tt>thrust::system</tt>
- *         for import convenience but are also aliased in the top-level <tt>thrust::cuda</tt>
- *         namespace for easy access.
+ *  \brief \p thrust::system::cuda is the namespace containing functionality
+ *  for allocating, manipulating, and deallocating memory available to Thrust's
+ *  CUDA backend system. The identifiers are provided in a separate namespace
+ *  underneath \p thrust::system for import convenience but are also
+ *  aliased in the top-level <tt>thrust::cuda</tt> namespace for easy access.
  *
  */
-
-namespace cuda {
+namespace system { namespace cuda
+{
 using thrust::cuda_cub::pointer;
+using thrust::cuda_cub::universal_pointer;
 using thrust::cuda_cub::reference;
-} // end cuda
-
+}} // namespace system::cuda
 /*! \}
  */
 
-} // end system
-
 /*! \namespace thrust::cuda
- *  \brief \p thrust::cuda is a top-level alias for \p thrust::system::cuda. */
-namespace cuda {
+ *  \brief \p thrust::cuda is a top-level alias for \p thrust::system::cuda.
+ */
+namespace cuda
+{
 using thrust::cuda_cub::pointer;
+using thrust::cuda_cub::universal_pointer;
 using thrust::cuda_cub::reference;
-} // end cuda
+} // namespace cuda
 
-} // end thrust
+THRUST_NAMESPACE_END
 
-#include <thrust/system/cuda/detail/pointer.inl>
diff --git a/thrust/system/cuda/vector.h b/thrust/system/cuda/vector.h
index a02e98d77..fafc7bf17 100644
--- a/thrust/system/cuda/vector.h
+++ b/thrust/system/cuda/vector.h
@@ -2,7 +2,7 @@
  *  Copyright 2008-2013 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in ccudaliance with the License.
+ *  you may not use this file except in compliance with the License.
  *  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
@@ -26,126 +26,64 @@
 #include <thrust/detail/vector_base.h>
 #include <vector>
 
-namespace thrust
-{
-
-// forward declaration of host_vector
-template<typename T, typename Allocator> class host_vector;
-
+THRUST_NAMESPACE_BEGIN
 namespace cuda_cub
 {
 
-// XXX upon c++11
-// template<typename T, typename Allocator = allocator<T> > using vector = thrust::detail::vector_base<T,Allocator>;
-
-/*! \p cuda_bulk::vector is a container that supports random access to elements,
+/*! \p cuda::vector is a container that supports random access to elements,
  *  constant time removal of elements at the end, and linear time insertion
  *  and removal of elements at the beginning or in the middle. The number of
- *  elements in a \p cuda_bulk::vector may vary dynamically; memory management is
- *  automatic. The elements contained in a \p cuda_bulk::vector reside in memory
- *  available to the \p cuda_bulk system.
+ *  elements in a \p cuda::vector may vary dynamically; memory management is
+ *  automatic. The elements contained in a \p cuda::vector reside in memory
+ *  accessible by the \p cuda system.
  *
- *  \tparam T The element type of the \p cuda_bulk::vector.
- *  \tparam Allocator The allocator type of the \p cuda_bulk::vector. Defaults to \p cuda_bulk::allocator.
+ *  \tparam T The element type of the \p cuda::vector.
+ *  \tparam Allocator The allocator type of the \p cuda::vector.
+ *          Defaults to \p cuda::allocator.
  *
- *  \see http://www.sgi.com/tech/stl/Vector.html
+ *  \see https://en.cppreference.com/w/cpp/container/vector
  *  \see host_vector For the documentation of the complete interface which is
- *                   shared by \p cuda_bulk::vector
+ *                   shared by \p cuda::vector
  *  \see device_vector
+ *  \see universal_vector
  */
-template<typename T, typename Allocator = allocator<T> >
-  class vector
-    : public thrust::detail::vector_base<T,Allocator>
-{
-  /*! \cond
-   */
-  private:
-    typedef thrust::detail::vector_base<T,Allocator> super_t;
-  /*! \endcond
-   */
-
-  public:
-
-  /*! \cond
-   */
-    typedef typename super_t::size_type  size_type;
-    typedef typename super_t::value_type value_type;
-  /*! \endcond
-   */
-
-    /*! This constructor creates an empty \p cuda_bulk::vector.
-     */
-    vector();
-
-    /*! This constructor creates a \p cuda_bulk::vector with \p n default-constructed elements.
-     *  \param n The size of the \p cuda_bulk::vector to create.
-     */
-    explicit vector(size_type n);
-
-    /*! This constructor creates a \p cuda_bulk::vector with \p n copies of \p value.
-     *  \param n The size of the \p cuda_bulk::vector to create.
-     *  \param value An element to copy.
-     */
-    explicit vector(size_type n, const value_type &value);
-
-    /*! Copy constructor copies from another \p cuda_bulk::vector.
-     *  \param x The other \p cuda_bulk::vector to copy.
-     */
-    vector(const vector &x);
-
-    /*! This constructor copies from another Thrust vector-like object.
-     *  \param x The other object to copy from.
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector(const thrust::detail::vector_base<OtherT,OtherAllocator> &x);
-
-    /*! This constructor copies from a \c std::vector.
-     *  \param x The \c std::vector to copy from.
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector(const std::vector<OtherT,OtherAllocator> &x);
-
-    /*! This constructor creates a \p cuda_bulk::vector by copying from a range.
-     *  \param first The beginning of the range.
-     *  \param last The end of the range.
-     */
-    template<typename InputIterator>
-    vector(InputIterator first, InputIterator last);
-
-    // XXX vector_base should take a Derived type so we don't have to define these superfluous assigns
-    //
-    /*! Assignment operator assigns from a \c std::vector.
-     *  \param x The \c std::vector to assign from.
-     *  \return <tt>*this</tt>
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector &operator=(const std::vector<OtherT,OtherAllocator> &x);
-
-    /*! Assignment operator assigns from another Thrust vector-like object.
-     *  \param x The other object to assign from.
-     *  \return <tt>*this</tt>
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector &operator=(const thrust::detail::vector_base<OtherT,OtherAllocator> &x);
-}; // end vector
+template <typename T, typename Allocator = thrust::system::cuda::allocator<T>>
+using vector = thrust::detail::vector_base<T, Allocator>;
+
+/*! \p cuda::universal_vector is a container that supports random access to
+ *  elements, constant time removal of elements at the end, and linear time
+ *  insertion and removal of elements at the beginning or in the middle. The
+ *  number of elements in a \p cuda::universal_vector may vary dynamically;
+ *  memory management is automatic. The elements contained in a
+ *  \p cuda::universal_vector reside in memory accessible by the \p cuda system
+ *  and host systems.
+ *
+ *  \tparam T The element type of the \p cuda::universal_vector.
+ *  \tparam Allocator The allocator type of the \p cuda::universal_vector.
+ *          Defaults to \p cuda::universal_allocator.
+ *
+ *  \see https://en.cppreference.com/w/cpp/container/vector
+ *  \see host_vector For the documentation of the complete interface which is
+ *                   shared by \p cuda::universal_vector
+ *  \see device_vector
+ *  \see universal_vector
+ */
+template <typename T, typename Allocator = thrust::system::cuda::universal_allocator<T>>
+using universal_vector = thrust::detail::vector_base<T, Allocator>;
 
-} // end cuda_cub
+} // namespace cuda_cub
 
-// alias system::cuda_bulk names at top-level
-namespace cuda
+namespace system { namespace cuda
 {
-
 using thrust::cuda_cub::vector;
+using thrust::cuda_cub::universal_vector;
+}}
 
-} // end cuda_bulk
-
-namespace system {
-namespace cuda {
+namespace cuda
+{
 using thrust::cuda_cub::vector;
+using thrust::cuda_cub::universal_vector;
 }
-}
-
-} // end thrust
 
-#include <thrust/system/cuda/detail/vector.inl>
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/adl/async/scan.h b/thrust/system/detail/adl/async/scan.h
new file mode 100644
index 000000000..a2a90618b
--- /dev/null
+++ b/thrust/system/detail/adl/async/scan.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+// The purpose of this header is to #include the async/scan.h header of the
+// sequential, host, and device systems. It should be #included in any code
+// which uses ADL to dispatch async scans.
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+//#include <thrust/system/detail/sequential/async/scan.h>
+
+//#define __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/async/scan.h>
+//#include __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER
+//#undef __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/async/scan.h>
+#include __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER
+#undef __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER
+
diff --git a/thrust/system/detail/bad_alloc.h b/thrust/system/detail/bad_alloc.h
index 461704fd6..ae5dd5994 100644
--- a/thrust/system/detail/bad_alloc.h
+++ b/thrust/system/detail/bad_alloc.h
@@ -20,8 +20,9 @@
 #include <new>
 #include <string>
 
-namespace thrust
-{
+#include <thrust/detail/config.h>
+
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -40,9 +41,9 @@ class bad_alloc
       m_what += w;
     } // end bad_alloc()
 
-    inline virtual ~bad_alloc(void) throw () {};
+    inline virtual ~bad_alloc(void) noexcept {};
 
-    inline virtual const char *what(void) const throw()
+    inline virtual const char *what(void) const noexcept
     {
       return m_what.c_str();
     } // end what()
@@ -53,5 +54,5 @@ class bad_alloc
   
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/errno.h b/thrust/system/detail/errno.h
index 78aec2ace..69cb2bd98 100644
--- a/thrust/system/detail/errno.h
+++ b/thrust/system/detail/errno.h
@@ -24,8 +24,7 @@
 // pollute the global namespace. These identifiers are in lowercase to avoid
 // colliding with the real macros in errno.h.
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -116,5 +115,5 @@ static const int emlink          = 9979;
 
 } // end system
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/error_category.inl b/thrust/system/detail/error_category.inl
index 4602b0f30..45fd15a3f 100644
--- a/thrust/system/detail/error_category.inl
+++ b/thrust/system/detail/error_category.inl
@@ -17,13 +17,14 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/system/error_code.h>
 #include <thrust/system/detail/errno.h>
 #include <thrust/functional.h>
 #include <cstring>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -232,5 +233,5 @@ const error_category &system_category(void)
 
 } // end system
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/error_code.inl b/thrust/system/detail/error_code.inl
index 6631f486f..2b819c048 100644
--- a/thrust/system/detail/error_code.inl
+++ b/thrust/system/detail/error_code.inl
@@ -17,10 +17,11 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/system/error_code.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -193,5 +194,5 @@ bool operator!=(const error_condition &lhs, const error_condition &rhs)
 
 } // end system
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/error_condition.inl b/thrust/system/detail/error_condition.inl
index 9dc493bcc..0daf1f293 100644
--- a/thrust/system/detail/error_condition.inl
+++ b/thrust/system/detail/error_condition.inl
@@ -17,11 +17,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/system/detail/error_condition.inl>
 #include <thrust/functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -129,5 +130,5 @@ bool operator<(const error_condition &lhs,
 
 } // end system
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/adjacent_difference.h b/thrust/system/detail/generic/adjacent_difference.h
index 6e4caaa88..43592e15b 100644
--- a/thrust/system/detail/generic/adjacent_difference.h
+++ b/thrust/system/detail/generic/adjacent_difference.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -52,7 +51,7 @@ OutputIterator adjacent_difference(thrust::execution_policy<DerivedPolicy> &exec
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/adjacent_difference.inl>
 
diff --git a/thrust/system/detail/generic/adjacent_difference.inl b/thrust/system/detail/generic/adjacent_difference.inl
index ad4ad1cd4..504129328 100644
--- a/thrust/system/detail/generic/adjacent_difference.inl
+++ b/thrust/system/detail/generic/adjacent_difference.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/adjacent_difference.h>
 #include <thrust/adjacent_difference.h>
@@ -22,8 +24,7 @@
 #include <thrust/detail/temporary_array.h>
 #include <thrust/transform.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -57,17 +58,17 @@ OutputIterator adjacent_difference(thrust::execution_policy<DerivedPolicy> &exec
   if(first == last)
   {
     // empty range, nothing to do
-    return result; 
+    return result;
   }
-  else 
+  else
   {
     // an in-place operation is requested, copy the input and call the entry point
     // XXX a special-purpose kernel would be faster here since
     // only block boundaries need to be copied
     thrust::detail::temporary_array<InputType, DerivedPolicy> input_copy(exec, first, last);
-    
+
     *result = *first;
-    thrust::transform(exec, input_copy.begin() + 1, input_copy.end(), input_copy.begin(), result + 1, binary_op); 
+    thrust::transform(exec, input_copy.begin() + 1, input_copy.end(), input_copy.begin(), result + 1, binary_op);
   }
 
   return result + (last - first);
@@ -77,5 +78,5 @@ OutputIterator adjacent_difference(thrust::execution_policy<DerivedPolicy> &exec
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/advance.h b/thrust/system/detail/generic/advance.h
index f9cab587b..4d6562e00 100644
--- a/thrust/system/detail/generic/advance.h
+++ b/thrust/system/detail/generic/advance.h
@@ -19,8 +19,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -35,7 +34,7 @@ void advance(InputIterator& i, Distance n);
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/advance.inl>
 
diff --git a/thrust/system/detail/generic/advance.inl b/thrust/system/detail/generic/advance.inl
index ae98d596b..21555ebb0 100644
--- a/thrust/system/detail/generic/advance.inl
+++ b/thrust/system/detail/generic/advance.inl
@@ -14,12 +14,13 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/advance.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -63,5 +64,5 @@ void advance(InputIterator& i, Distance n)
 } // end namespace detail
 } // end namespace generic
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/binary_search.h b/thrust/system/detail/generic/binary_search.h
index 8cd85c63f..6603f6c30 100644
--- a/thrust/system/detail/generic/binary_search.h
+++ b/thrust/system/detail/generic/binary_search.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -168,7 +167,7 @@ equal_range(thrust::execution_policy<DerivedPolicy> &exec,
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/binary_search.inl>
 
diff --git a/thrust/system/detail/generic/binary_search.inl b/thrust/system/detail/generic/binary_search.inl
index 143d8659f..bc60bb8e5 100644
--- a/thrust/system/detail/generic/binary_search.inl
+++ b/thrust/system/detail/generic/binary_search.inl
@@ -14,11 +14,6 @@
  *  limitations under the License.
  */
 
-
-/*! \file binary_search.inl
- *  \brief Inline file for binary_search.h
- */
-
 #pragma once
 
 #include <thrust/detail/config.h>
@@ -31,12 +26,12 @@
 #include <thrust/for_each.h>
 #include <thrust/detail/function.h>
 #include <thrust/system/detail/generic/scalar/binary_search.h>
+#include <thrust/system/detail/generic/select_system.h>
 
 #include <thrust/detail/temporary_array.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -88,9 +83,9 @@ struct bsf
   bool operator()(RandomAccessIterator begin, RandomAccessIterator end, const T& value, StrictWeakOrdering comp)
   {
     RandomAccessIterator iter = thrust::system::detail::generic::scalar::lower_bound(begin, end, value, comp);
-    
+
     thrust::detail::wrapped_function<StrictWeakOrdering,bool> wrapped_comp(comp);
-    
+
     return iter != end && !wrapped_comp(value, *iter);
   }
 };
@@ -103,11 +98,11 @@ struct binary_search_functor
   ForwardIterator end;
   StrictWeakOrdering comp;
   BinarySearchFunction func;
-  
+
   __host__ __device__
   binary_search_functor(ForwardIterator begin, ForwardIterator end, StrictWeakOrdering comp, BinarySearchFunction func)
     : begin(begin), end(end), comp(comp), func(func) {}
-  
+
   template<typename Tuple>
   __host__ __device__
   void operator()(Tuple t)
@@ -121,9 +116,9 @@ struct binary_search_functor
 template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering, typename BinarySearchFunction>
 __host__ __device__
 OutputIterator binary_search(thrust::execution_policy<DerivedPolicy> &exec,
-                             ForwardIterator begin, 
+                             ForwardIterator begin,
                              ForwardIterator end,
-                             InputIterator values_begin, 
+                             InputIterator values_begin,
                              InputIterator values_end,
                              OutputIterator output,
                              StrictWeakOrdering comp,
@@ -133,11 +128,11 @@ OutputIterator binary_search(thrust::execution_policy<DerivedPolicy> &exec,
                    thrust::make_zip_iterator(thrust::make_tuple(values_begin, output)),
                    thrust::make_zip_iterator(thrust::make_tuple(values_end, output + thrust::distance(values_begin, values_end))),
                    detail::binary_search_functor<ForwardIterator, StrictWeakOrdering, BinarySearchFunction>(begin, end, comp, func));
-  
+
   return output + thrust::distance(values_begin, values_end);
 }
 
-   
+
 
 // Scalar Implementation
 template<typename OutputType, typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering, typename BinarySearchFunction>
@@ -145,24 +140,39 @@ __host__ __device__
 OutputType binary_search(thrust::execution_policy<DerivedPolicy> &exec,
                          ForwardIterator begin,
                          ForwardIterator end,
-                         const T& value, 
+                         const T& value,
                          StrictWeakOrdering comp,
                          BinarySearchFunction func)
 {
   // use the vectorized path to implement the scalar version
-  
+
   // allocate device buffers for value and output
   thrust::detail::temporary_array<T,DerivedPolicy>          d_value(exec,1);
   thrust::detail::temporary_array<OutputType,DerivedPolicy> d_output(exec,1);
-  
-  // copy value to device
-  d_value[0] = value;
-  
+
+  { // copy value to device
+    typedef typename thrust::iterator_system<const T*>::type value_in_system_t;
+    value_in_system_t value_in_system;
+    using thrust::system::detail::generic::select_system;
+    thrust::copy_n(select_system(thrust::detail::derived_cast(thrust::detail::strip_const(value_in_system)),
+                                 thrust::detail::derived_cast(thrust::detail::strip_const(exec))),
+                   &value, 1, d_value.begin());
+  }
+
   // perform the query
   thrust::system::detail::generic::detail::binary_search(exec, begin, end, d_value.begin(), d_value.end(), d_output.begin(), comp, func);
-  
-  // copy result to host and return
-  return d_output[0];
+
+  OutputType output;
+  { // copy result to host and return
+    typedef typename thrust::iterator_system<OutputType*>::type result_out_system_t;
+    result_out_system_t result_out_system;
+    using thrust::system::detail::generic::select_system;
+    thrust::copy_n(select_system(thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+                                 thrust::detail::derived_cast(thrust::detail::strip_const(result_out_system))),
+                   d_output.begin(), 1, &output);
+  }
+
+  return output;
 }
 
 
@@ -180,7 +190,7 @@ struct binary_search_less
   }
 };
 
-   
+
 } // end namespace detail
 
 
@@ -205,11 +215,11 @@ __host__ __device__
 ForwardIterator lower_bound(thrust::execution_policy<DerivedPolicy> &exec,
                             ForwardIterator begin,
                             ForwardIterator end,
-                            const T& value, 
+                            const T& value,
                             StrictWeakOrdering comp)
 {
   typedef typename thrust::iterator_traits<ForwardIterator>::difference_type difference_type;
-  
+
   return begin + detail::binary_search<difference_type>(exec, begin, end, value, comp, detail::lbf());
 }
 
@@ -231,11 +241,11 @@ __host__ __device__
 ForwardIterator upper_bound(thrust::execution_policy<DerivedPolicy> &exec,
                             ForwardIterator begin,
                             ForwardIterator end,
-                            const T& value, 
+                            const T& value,
                             StrictWeakOrdering comp)
 {
   typedef typename thrust::iterator_traits<ForwardIterator>::difference_type difference_type;
-  
+
   return begin + detail::binary_search<difference_type>(exec, begin, end, value, comp, detail::ubf());
 }
 
@@ -256,7 +266,7 @@ __host__ __device__
 bool binary_search(thrust::execution_policy<DerivedPolicy> &exec,
                    ForwardIterator begin,
                    ForwardIterator end,
-                   const T& value, 
+                   const T& value,
                    StrictWeakOrdering comp)
 {
   return detail::binary_search<bool>(exec, begin, end, value, comp, detail::bsf());
@@ -271,9 +281,9 @@ bool binary_search(thrust::execution_policy<DerivedPolicy> &exec,
 template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
 __host__ __device__
 OutputIterator lower_bound(thrust::execution_policy<DerivedPolicy> &exec,
-                           ForwardIterator begin, 
+                           ForwardIterator begin,
                            ForwardIterator end,
-                           InputIterator values_begin, 
+                           InputIterator values_begin,
                            InputIterator values_end,
                            OutputIterator output)
 {
@@ -285,9 +295,9 @@ OutputIterator lower_bound(thrust::execution_policy<DerivedPolicy> &exec,
 template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
 __host__ __device__
 OutputIterator lower_bound(thrust::execution_policy<DerivedPolicy> &exec,
-                           ForwardIterator begin, 
+                           ForwardIterator begin,
                            ForwardIterator end,
-                           InputIterator values_begin, 
+                           InputIterator values_begin,
                            InputIterator values_end,
                            OutputIterator output,
                            StrictWeakOrdering comp)
@@ -299,9 +309,9 @@ OutputIterator lower_bound(thrust::execution_policy<DerivedPolicy> &exec,
 template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
 __host__ __device__
 OutputIterator upper_bound(thrust::execution_policy<DerivedPolicy> &exec,
-                           ForwardIterator begin, 
+                           ForwardIterator begin,
                            ForwardIterator end,
-                           InputIterator values_begin, 
+                           InputIterator values_begin,
                            InputIterator values_end,
                            OutputIterator output)
 {
@@ -313,9 +323,9 @@ OutputIterator upper_bound(thrust::execution_policy<DerivedPolicy> &exec,
 template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
 __host__ __device__
 OutputIterator upper_bound(thrust::execution_policy<DerivedPolicy> &exec,
-                           ForwardIterator begin, 
+                           ForwardIterator begin,
                            ForwardIterator end,
-                           InputIterator values_begin, 
+                           InputIterator values_begin,
                            InputIterator values_end,
                            OutputIterator output,
                            StrictWeakOrdering comp)
@@ -327,9 +337,9 @@ OutputIterator upper_bound(thrust::execution_policy<DerivedPolicy> &exec,
 template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
 __host__ __device__
 OutputIterator binary_search(thrust::execution_policy<DerivedPolicy> &exec,
-                             ForwardIterator begin, 
+                             ForwardIterator begin,
                              ForwardIterator end,
-                             InputIterator values_begin, 
+                             InputIterator values_begin,
                              InputIterator values_end,
                              OutputIterator output)
 {
@@ -341,9 +351,9 @@ OutputIterator binary_search(thrust::execution_policy<DerivedPolicy> &exec,
 template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
 __host__ __device__
 OutputIterator binary_search(thrust::execution_policy<DerivedPolicy> &exec,
-                             ForwardIterator begin, 
+                             ForwardIterator begin,
                              ForwardIterator end,
-                             InputIterator values_begin, 
+                             InputIterator values_begin,
                              InputIterator values_end,
                              OutputIterator output,
                              StrictWeakOrdering comp)
@@ -382,5 +392,5 @@ equal_range(thrust::execution_policy<DerivedPolicy> &exec,
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/copy.h b/thrust/system/detail/generic/copy.h
index e22535618..36ac71899 100644
--- a/thrust/system/detail/generic/copy.h
+++ b/thrust/system/detail/generic/copy.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -53,7 +52,7 @@ __host__ __device__
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/copy.inl>
 
diff --git a/thrust/system/detail/generic/copy.inl b/thrust/system/detail/generic/copy.inl
index 9763a0682..34d66baa6 100644
--- a/thrust/system/detail/generic/copy.inl
+++ b/thrust/system/detail/generic/copy.inl
@@ -26,8 +26,7 @@
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/iterator/detail/minimum_system.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -77,5 +76,5 @@ __host__ __device__
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/copy_if.h b/thrust/system/detail/generic/copy_if.h
index 6e3fb73a6..6a13edfda 100644
--- a/thrust/system/detail/generic/copy_if.h
+++ b/thrust/system/detail/generic/copy_if.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -58,7 +57,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/copy_if.inl>
 
diff --git a/thrust/system/detail/generic/copy_if.inl b/thrust/system/detail/generic/copy_if.inl
index 4bdafe382..5a6edd72e 100644
--- a/thrust/system/detail/generic/copy_if.inl
+++ b/thrust/system/detail/generic/copy_if.inl
@@ -32,8 +32,7 @@
 #include <thrust/scatter.h>
 #include <limits>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -157,5 +156,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/count.h b/thrust/system/detail/generic/count.h
index 218369e38..295d36e6b 100644
--- a/thrust/system/detail/generic/count.h
+++ b/thrust/system/detail/generic/count.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -45,7 +44,7 @@ count_if(thrust::execution_policy<DerivedPolicy> &exec, InputIterator first, Inp
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/count.inl>
 
diff --git a/thrust/system/detail/generic/count.inl b/thrust/system/detail/generic/count.inl
index d9e1039e8..dafc1c1df 100644
--- a/thrust/system/detail/generic/count.inl
+++ b/thrust/system/detail/generic/count.inl
@@ -14,13 +14,14 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/count.h>
 #include <thrust/transform_reduce.h>
 #include <thrust/detail/internal_functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -32,7 +33,7 @@ namespace generic
 template <typename InputType, typename Predicate, typename CountType>
 struct count_if_transform
 {
-  __host__ __device__ 
+  __host__ __device__
   count_if_transform(Predicate _pred) : pred(_pred){}
 
   __thrust_exec_check_disable__
@@ -54,8 +55,9 @@ __host__ __device__
 typename thrust::iterator_traits<InputIterator>::difference_type
 count(thrust::execution_policy<DerivedPolicy> &exec, InputIterator first, InputIterator last, const EqualityComparable& value)
 {
-  // XXX use placeholder expression here
-  return thrust::count_if(exec, first, last, thrust::detail::equal_to_value<EqualityComparable>(value));
+  using thrust::placeholders::_1;
+
+  return thrust::count_if(exec, first, last, _1 == value);
 } // end count()
 
 
@@ -66,7 +68,7 @@ count_if(thrust::execution_policy<DerivedPolicy> &exec, InputIterator first, Inp
 {
   typedef typename thrust::iterator_traits<InputIterator>::value_type InputType;
   typedef typename thrust::iterator_traits<InputIterator>::difference_type CountType;
-  
+
   thrust::system::detail::generic::count_if_transform<InputType, Predicate, CountType> unary_op(pred);
   thrust::plus<CountType> binary_op;
   return thrust::transform_reduce(exec, first, last, unary_op, CountType(0), binary_op);
@@ -76,5 +78,5 @@ count_if(thrust::execution_policy<DerivedPolicy> &exec, InputIterator first, Inp
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/distance.h b/thrust/system/detail/generic/distance.h
index 03b0fb556..4627376b5 100644
--- a/thrust/system/detail/generic/distance.h
+++ b/thrust/system/detail/generic/distance.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -37,7 +36,7 @@ inline __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/distance.inl>
 
diff --git a/thrust/system/detail/generic/distance.inl b/thrust/system/detail/generic/distance.inl
index 5cc697200..46bad7ba7 100644
--- a/thrust/system/detail/generic/distance.inl
+++ b/thrust/system/detail/generic/distance.inl
@@ -14,12 +14,13 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/distance.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -60,7 +61,7 @@ inline __host__ __device__
 
 } // end detail
 
-
+__thrust_exec_check_disable__
 template<typename InputIterator>
 inline __host__ __device__
   typename thrust::iterator_traits<InputIterator>::difference_type
@@ -75,5 +76,5 @@ inline __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/equal.h b/thrust/system/detail/generic/equal.h
index 8962b1bd1..4afd88d00 100644
--- a/thrust/system/detail/generic/equal.h
+++ b/thrust/system/detail/generic/equal.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -42,7 +41,7 @@ bool equal(thrust::execution_policy<DerivedPolicy> &exec, InputIterator1 first1,
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/equal.inl>
 
diff --git a/thrust/system/detail/generic/equal.inl b/thrust/system/detail/generic/equal.inl
index 7c9dec4bc..c023070cd 100644
--- a/thrust/system/detail/generic/equal.inl
+++ b/thrust/system/detail/generic/equal.inl
@@ -14,14 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/equal.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/internal_functional.h>
 #include <thrust/mismatch.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -35,7 +36,7 @@ __host__ __device__
 bool equal(thrust::execution_policy<DerivedPolicy> &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2)
 {
   typedef typename thrust::iterator_traits<InputIterator1>::value_type InputType1;
-  
+
   return thrust::equal(exec, first1, last1, first2, thrust::detail::equal_to<InputType1>());
 }
 
@@ -54,5 +55,5 @@ bool equal(thrust::execution_policy<DerivedPolicy> &exec, InputIterator1 first1,
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/extrema.h b/thrust/system/detail/generic/extrema.h
index a3ee81889..e3b447958 100644
--- a/thrust/system/detail/generic/extrema.h
+++ b/thrust/system/detail/generic/extrema.h
@@ -25,8 +25,7 @@
 #include <thrust/pair.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -83,7 +82,7 @@ thrust::pair<ForwardIterator,ForwardIterator> minmax_element(thrust::execution_p
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/extrema.inl>
 
diff --git a/thrust/system/detail/generic/extrema.inl b/thrust/system/detail/generic/extrema.inl
index 22183db9a..744d137de 100644
--- a/thrust/system/detail/generic/extrema.inl
+++ b/thrust/system/detail/generic/extrema.inl
@@ -33,8 +33,7 @@
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -259,5 +258,5 @@ thrust::pair<ForwardIterator,ForwardIterator> minmax_element(thrust::execution_p
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/fill.h b/thrust/system/detail/generic/fill.h
index 6c4f2ed4e..5a881359b 100644
--- a/thrust/system/detail/generic/fill.h
+++ b/thrust/system/detail/generic/fill.h
@@ -16,12 +16,13 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/internal_functional.h>
 #include <thrust/generate.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -56,5 +57,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/find.h b/thrust/system/detail/generic/find.h
index 00e11e53c..6db441d02 100644
--- a/thrust/system/detail/generic/find.h
+++ b/thrust/system/detail/generic/find.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -57,7 +56,7 @@ InputIterator find_if_not(thrust::execution_policy<DerivedPolicy> &exec,
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/find.inl>
 
diff --git a/thrust/system/detail/generic/find.inl b/thrust/system/detail/generic/find.inl
index 9414fc615..8bd619561 100644
--- a/thrust/system/detail/generic/find.inl
+++ b/thrust/system/detail/generic/find.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/find.h>
 #include <thrust/reduce.h>
@@ -28,8 +30,7 @@
 
 // Contributed by Erich Elsen
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -45,8 +46,9 @@ InputIterator find(thrust::execution_policy<DerivedPolicy> &exec,
                    InputIterator last,
                    const T& value)
 {
-  // XXX consider a placeholder expression here
-  return thrust::find_if(exec, first, last, thrust::detail::equal_to_value<T>(value));
+  using thrust::placeholders::_1;
+
+  return thrust::find_if(exec, first, last, _1 == value);
 } // end find()
 
 
@@ -71,7 +73,7 @@ struct find_if_functor
     }
   }
 };
-    
+
 
 template<typename DerivedPolicy, typename InputIterator, typename Predicate>
 __host__ __device__
@@ -82,30 +84,30 @@ InputIterator find_if(thrust::execution_policy<DerivedPolicy> &exec,
 {
   typedef typename thrust::iterator_traits<InputIterator>::difference_type difference_type;
   typedef typename thrust::tuple<bool,difference_type> result_type;
-  
+
   // empty sequence
   if(first == last) return last;
-  
+
   const difference_type n = thrust::distance(first, last);
-  
+
   // this implementation breaks up the sequence into separate intervals
   // in an attempt to early-out as soon as a value is found
-  
+
   // TODO incorporate sizeof(InputType) into interval_threshold and round to multiple of 32
   const difference_type interval_threshold = 1 << 20;
   const difference_type interval_size = (thrust::min)(interval_threshold, n);
-  
+
   // force transform_iterator output to bool
   typedef thrust::transform_iterator<Predicate, InputIterator, bool> XfrmIterator;
   typedef thrust::tuple<XfrmIterator, thrust::counting_iterator<difference_type> > IteratorTuple;
   typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
-  
+
   IteratorTuple iter_tuple = thrust::make_tuple(XfrmIterator(first, pred),
                                                 thrust::counting_iterator<difference_type>(0));
-  
+
   ZipIterator begin = thrust::make_zip_iterator(iter_tuple);
   ZipIterator end   = begin + n;
-  
+
   for(ZipIterator interval_begin = begin; interval_begin < end; interval_begin += interval_size)
   {
     ZipIterator interval_end = interval_begin + interval_size;
@@ -113,19 +115,19 @@ InputIterator find_if(thrust::execution_policy<DerivedPolicy> &exec,
     {
       interval_end = end;
     } // end if
-    
+
     result_type result = thrust::reduce(exec,
                                         interval_begin, interval_end,
                                         result_type(false,interval_end - begin),
                                         find_if_functor<result_type>());
-    
+
     // see if we found something
     if(thrust::get<0>(result))
     {
       return first + thrust::get<1>(result);
     }
   }
-  
+
   //nothing was found if we reach here...
   return first + n;
 }
@@ -145,5 +147,5 @@ InputIterator find_if_not(thrust::execution_policy<DerivedPolicy> &exec,
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/for_each.h b/thrust/system/detail/generic/for_each.h
index c4add4305..3c2ec12cd 100644
--- a/thrust/system/detail/generic/for_each.h
+++ b/thrust/system/detail/generic/for_each.h
@@ -26,8 +26,7 @@
 #include <thrust/system/detail/generic/tag.h>
 #include <thrust/detail/static_assert.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -74,5 +73,5 @@ InputIterator for_each_n(thrust::execution_policy<DerivedPolicy> &,
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/gather.h b/thrust/system/detail/generic/gather.h
index d587572f0..5b6b41831 100644
--- a/thrust/system/detail/generic/gather.h
+++ b/thrust/system/detail/generic/gather.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -75,7 +74,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/gather.inl>
 
diff --git a/thrust/system/detail/generic/gather.inl b/thrust/system/detail/generic/gather.inl
index 4f4289ecb..7ab550edf 100644
--- a/thrust/system/detail/generic/gather.inl
+++ b/thrust/system/detail/generic/gather.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/gather.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -21,8 +23,7 @@
 #include <thrust/transform.h>
 #include <thrust/iterator/permutation_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -103,5 +104,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/generate.h b/thrust/system/detail/generic/generate.h
index edc2cc5eb..a9846c5be 100644
--- a/thrust/system/detail/generic/generate.h
+++ b/thrust/system/detail/generic/generate.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -51,7 +50,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/generate.inl>
 
diff --git a/thrust/system/detail/generic/generate.inl b/thrust/system/detail/generic/generate.inl
index 9ca319b99..869e0f32b 100644
--- a/thrust/system/detail/generic/generate.inl
+++ b/thrust/system/detail/generic/generate.inl
@@ -14,14 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/generate.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/internal_functional.h>
 #include <thrust/for_each.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -95,5 +96,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/inner_product.h b/thrust/system/detail/generic/inner_product.h
index 71e1a9270..62d10d31f 100644
--- a/thrust/system/detail/generic/inner_product.h
+++ b/thrust/system/detail/generic/inner_product.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -53,7 +52,7 @@ OutputType inner_product(thrust::execution_policy<DerivedPolicy> &exec,
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/inner_product.inl>
 
diff --git a/thrust/system/detail/generic/inner_product.inl b/thrust/system/detail/generic/inner_product.inl
index 0a50386be..5055ec10f 100644
--- a/thrust/system/detail/generic/inner_product.inl
+++ b/thrust/system/detail/generic/inner_product.inl
@@ -14,14 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/inner_product.h>
 #include <thrust/functional.h>
 #include <thrust/detail/internal_functional.h>
 #include <thrust/transform_reduce.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -50,7 +51,7 @@ OutputType inner_product(thrust::execution_policy<DerivedPolicy> &exec,
                          InputIterator1 first1,
                          InputIterator1 last1,
                          InputIterator2 first2,
-                         OutputType init, 
+                         OutputType init,
                          BinaryFunction1 binary_op1,
                          BinaryFunction2 binary_op2)
 {
@@ -68,5 +69,5 @@ OutputType inner_product(thrust::execution_policy<DerivedPolicy> &exec,
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/logical.h b/thrust/system/detail/generic/logical.h
index 702dbad85..e261154e2 100644
--- a/thrust/system/detail/generic/logical.h
+++ b/thrust/system/detail/generic/logical.h
@@ -22,8 +22,7 @@
 #include <thrust/find.h>
 #include <thrust/logical.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -59,5 +58,5 @@ bool none_of(thrust::execution_policy<ExecutionPolicy> &exec, InputIterator firs
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/memory.h b/thrust/system/detail/generic/memory.h
index 344b3673d..675cc7302 100644
--- a/thrust/system/detail/generic/memory.h
+++ b/thrust/system/detail/generic/memory.h
@@ -29,8 +29,7 @@
 #include <thrust/detail/pointer.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -65,7 +64,7 @@ void iter_swap(thrust::execution_policy<DerivedPolicy>&, Pointer1, Pointer2);
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/memory.inl>
 
diff --git a/thrust/system/detail/generic/memory.inl b/thrust/system/detail/generic/memory.inl
index eadf39ae9..b85729098 100644
--- a/thrust/system/detail/generic/memory.inl
+++ b/thrust/system/detail/generic/memory.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
 #include <thrust/system/detail/generic/memory.h>
@@ -21,8 +23,7 @@
 #include <thrust/detail/static_assert.h>
 #include <thrust/detail/malloc_and_free.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -100,5 +101,5 @@ void iter_swap(thrust::execution_policy<DerivedPolicy> &, Pointer1, Pointer2)
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/merge.h b/thrust/system/detail/generic/merge.h
index d80906e3d..6e8246407 100644
--- a/thrust/system/detail/generic/merge.h
+++ b/thrust/system/detail/generic/merge.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -85,7 +84,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/merge.inl>
 
diff --git a/thrust/system/detail/generic/merge.inl b/thrust/system/detail/generic/merge.inl
index 2938e8c92..03b77e623 100644
--- a/thrust/system/detail/generic/merge.inl
+++ b/thrust/system/detail/generic/merge.inl
@@ -25,8 +25,7 @@
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/detail/internal_functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -127,5 +126,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/mismatch.h b/thrust/system/detail/generic/mismatch.h
index 50e9f678b..4a71cd344 100644
--- a/thrust/system/detail/generic/mismatch.h
+++ b/thrust/system/detail/generic/mismatch.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -52,7 +51,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/mismatch.inl>
 
diff --git a/thrust/system/detail/generic/mismatch.inl b/thrust/system/detail/generic/mismatch.inl
index d879a6e11..f6b9674a1 100644
--- a/thrust/system/detail/generic/mismatch.inl
+++ b/thrust/system/detail/generic/mismatch.inl
@@ -14,14 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/mismatch.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/internal_functional.h>
 #include <thrust/find.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -38,10 +39,9 @@ __host__ __device__
              InputIterator1 last1,
              InputIterator2 first2)
 {
-  typedef typename thrust::iterator_value<InputIterator1>::type InputType1;
-  
-  // XXX use a placeholder expression here
-  return thrust::mismatch(exec, first1, last1, first2, thrust::detail::equal_to<InputType1>());
+  using namespace thrust::placeholders;
+
+  return thrust::mismatch(exec, first1, last1, first2, _1 == _2);
 } // end mismatch()
 
 
@@ -57,12 +57,12 @@ __host__ __device__
   // Contributed by Erich Elsen
   typedef thrust::tuple<InputIterator1,InputIterator2> IteratorTuple;
   typedef thrust::zip_iterator<IteratorTuple>          ZipIterator;
-  
+
   ZipIterator zipped_first = thrust::make_zip_iterator(thrust::make_tuple(first1,first2));
   ZipIterator zipped_last  = thrust::make_zip_iterator(thrust::make_tuple(last1, first2));
-  
+
   ZipIterator result = thrust::find_if_not(exec, zipped_first, zipped_last, thrust::detail::tuple_binary_predicate<BinaryPredicate>(pred));
-  
+
   return thrust::make_pair(thrust::get<0>(result.get_iterator_tuple()),
                            thrust::get<1>(result.get_iterator_tuple()));
 } // end mismatch()
@@ -71,5 +71,5 @@ __host__ __device__
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/partition.h b/thrust/system/detail/generic/partition.h
index fdd158c4c..113d6ecbc 100644
--- a/thrust/system/detail/generic/partition.h
+++ b/thrust/system/detail/generic/partition.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -164,7 +163,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/partition.inl>
 
diff --git a/thrust/system/detail/generic/partition.inl b/thrust/system/detail/generic/partition.inl
index 73a8a286e..ab56fdd57 100644
--- a/thrust/system/detail/generic/partition.inl
+++ b/thrust/system/detail/generic/partition.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/partition.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -29,8 +31,7 @@
 #include <thrust/detail/internal_functional.h>
 #include <thrust/detail/temporary_array.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -244,5 +245,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/per_device_resource.h b/thrust/system/detail/generic/per_device_resource.h
index 9378940f3..606f91f36 100644
--- a/thrust/system/detail/generic/per_device_resource.h
+++ b/thrust/system/detail/generic/per_device_resource.h
@@ -22,8 +22,7 @@
 #include <thrust/mr/memory_resource.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -43,5 +42,5 @@ MR * get_per_device_resource(thrust::detail::execution_policy_base<DerivedPolicy
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/reduce.h b/thrust/system/detail/generic/reduce.h
index c3e7af0d2..f28b11a87 100644
--- a/thrust/system/detail/generic/reduce.h
+++ b/thrust/system/detail/generic/reduce.h
@@ -21,8 +21,7 @@
 #include <thrust/system/detail/generic/tag.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -53,7 +52,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/reduce.inl>
 
diff --git a/thrust/system/detail/generic/reduce.inl b/thrust/system/detail/generic/reduce.inl
index b866e86dc..d673d0cf8 100644
--- a/thrust/system/detail/generic/reduce.inl
+++ b/thrust/system/detail/generic/reduce.inl
@@ -16,14 +16,15 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/reduce.h>
 #include <thrust/system/detail/generic/reduce.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/functional.h>
 #include <thrust/detail/static_assert.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -75,5 +76,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/reduce_by_key.h b/thrust/system/detail/generic/reduce_by_key.h
index aaa5959a4..8ba47e11f 100644
--- a/thrust/system/detail/generic/reduce_by_key.h
+++ b/thrust/system/detail/generic/reduce_by_key.h
@@ -21,8 +21,7 @@
 #include <thrust/system/detail/generic/tag.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -83,7 +82,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/reduce_by_key.inl>
 
diff --git a/thrust/system/detail/generic/reduce_by_key.inl b/thrust/system/detail/generic/reduce_by_key.inl
index 41c2106b0..2ea73feda 100644
--- a/thrust/system/detail/generic/reduce_by_key.inl
+++ b/thrust/system/detail/generic/reduce_by_key.inl
@@ -14,13 +14,10 @@
  *  limitations under the License.
  */
 
-
-/*! \file reduce_by_key.inl
- *  \brief Inline file for reduce_by_key.h.
- */
-
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/iterator/detail/minimum_system.h>
 #include <thrust/detail/type_traits.h>
@@ -35,8 +32,7 @@
 #include <thrust/scan.h>
 #include <thrust/detail/temporary_array.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -51,12 +47,12 @@ template <typename ValueType, typename TailFlagType, typename AssociativeOperato
 struct reduce_by_key_functor
 {
   AssociativeOperator binary_op;
-  
+
   typedef typename thrust::tuple<ValueType, TailFlagType> result_type;
-  
+
   __host__ __device__
   reduce_by_key_functor(AssociativeOperator _binary_op) : binary_op(_binary_op) {}
-  
+
   __host__ __device__
   result_type operator()(result_type a, result_type b)
   {
@@ -79,7 +75,7 @@ template<typename ExecutionPolicy,
 __host__ __device__
   thrust::pair<OutputIterator1,OutputIterator2>
     reduce_by_key(thrust::execution_policy<ExecutionPolicy> &exec,
-                  InputIterator1 keys_first, 
+                  InputIterator1 keys_first,
                   InputIterator1 keys_last,
                   InputIterator2 values_first,
                   OutputIterator1 keys_output,
@@ -91,27 +87,8 @@ __host__ __device__
 
     typedef unsigned int FlagType;  // TODO use difference_type
 
-    // the pseudocode for deducing the type of the temporary used below:
-    // 
-    // if BinaryFunction is AdaptableBinaryFunction
-    //   TemporaryType = AdaptableBinaryFunction::result_type
-    // else if OutputIterator2 is a "pure" output iterator
-    //   TemporaryType = InputIterator2::value_type
-    // else
-    //   TemporaryType = OutputIterator2::value_type
-    //
-    // XXX upon c++0x, TemporaryType needs to be:
-    // result_of_adaptable_function<BinaryFunction>::type
-
-    typedef typename thrust::detail::eval_if<
-      thrust::detail::has_result_type<BinaryFunction>::value,
-      thrust::detail::result_type<BinaryFunction>,
-      thrust::detail::eval_if<
-        thrust::detail::is_output_iterator<OutputIterator2>::value,
-        thrust::iterator_value<InputIterator2>,
-        thrust::iterator_value<OutputIterator2>
-      >
-    >::type ValueType;
+    // Use the input iterator's value type per https://wg21.link/P0571
+    using ValueType = typename thrust::iterator_value<InputIterator2>::type;
 
     if (keys_first == keys_last)
         return thrust::make_pair(keys_output, values_output);
@@ -120,7 +97,7 @@ __host__ __device__
     difference_type n = keys_last - keys_first;
 
     InputIterator2 values_last = values_first + n;
-    
+
     // compute head flags
     thrust::detail::temporary_array<FlagType,ExecutionPolicy> head_flags(exec, n);
     thrust::transform(exec, keys_first, keys_last - 1, keys_first + 1, head_flags.begin() + 1, thrust::detail::not2(binary_pred));
@@ -134,7 +111,7 @@ __host__ __device__
     // scan the values by flag
     thrust::detail::temporary_array<ValueType,ExecutionPolicy> scanned_values(exec, n);
     thrust::detail::temporary_array<FlagType,ExecutionPolicy>  scanned_tail_flags(exec, n);
-    
+
     thrust::inclusive_scan
         (exec,
          thrust::make_zip_iterator(thrust::make_tuple(values_first,           head_flags.begin())),
@@ -146,12 +123,12 @@ __host__ __device__
 
     // number of unique keys
     FlagType N = scanned_tail_flags[n - 1] + 1;
-    
-    // scatter the keys and accumulated values    
+
+    // scatter the keys and accumulated values
     thrust::scatter_if(exec, keys_first,            keys_last,             scanned_tail_flags.begin(), head_flags.begin(), keys_output);
     thrust::scatter_if(exec, scanned_values.begin(), scanned_values.end(), scanned_tail_flags.begin(), tail_flags.begin(), values_output);
 
-    return thrust::make_pair(keys_output + N, values_output + N); 
+    return thrust::make_pair(keys_output + N, values_output + N);
 } // end reduce_by_key()
 
 
@@ -163,7 +140,7 @@ template<typename ExecutionPolicy,
 __host__ __device__
   thrust::pair<OutputIterator1,OutputIterator2>
     reduce_by_key(thrust::execution_policy<ExecutionPolicy> &exec,
-                  InputIterator1 keys_first, 
+                  InputIterator1 keys_first,
                   InputIterator1 keys_last,
                   InputIterator2 values_first,
                   OutputIterator1 keys_output,
@@ -185,7 +162,7 @@ template<typename ExecutionPolicy,
 __host__ __device__
   thrust::pair<OutputIterator1,OutputIterator2>
     reduce_by_key(thrust::execution_policy<ExecutionPolicy> &exec,
-                  InputIterator1 keys_first, 
+                  InputIterator1 keys_first,
                   InputIterator1 keys_last,
                   InputIterator2 values_first,
                   OutputIterator1 keys_output,
@@ -200,7 +177,7 @@ __host__ __device__
 
   // use plus<T> as default BinaryFunction
   return thrust::reduce_by_key(exec,
-                               keys_first, keys_last, 
+                               keys_first, keys_last,
                                values_first,
                                keys_output,
                                values_output,
@@ -212,5 +189,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/remove.h b/thrust/system/detail/generic/remove.h
index 343f643e9..37354ef80 100644
--- a/thrust/system/detail/generic/remove.h
+++ b/thrust/system/detail/generic/remove.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -107,7 +106,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/remove.inl>
 
diff --git a/thrust/system/detail/generic/remove.inl b/thrust/system/detail/generic/remove.inl
index 6cb5a694b..e51a3caee 100644
--- a/thrust/system/detail/generic/remove.inl
+++ b/thrust/system/detail/generic/remove.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file remove.inl
- *  \brief Inline file for remove.h
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/remove.h>
@@ -27,8 +24,7 @@
 #include <thrust/detail/temporary_array.h>
 #include <thrust/remove.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -108,7 +104,7 @@ __host__ __device__
 
   // remove into temp
   return thrust::remove_copy_if(exec, temp.begin(), temp.end(), stencil, first, pred);
-} // end remove_if() 
+} // end remove_if()
 
 
 template<typename DerivedPolicy,
@@ -146,5 +142,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/replace.h b/thrust/system/detail/generic/replace.h
index 6167f711a..0821d6c07 100644
--- a/thrust/system/detail/generic/replace.h
+++ b/thrust/system/detail/generic/replace.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -92,7 +91,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/replace.inl>
 
diff --git a/thrust/system/detail/generic/replace.inl b/thrust/system/detail/generic/replace.inl
index d5b6caa63..ed845dd45 100644
--- a/thrust/system/detail/generic/replace.inl
+++ b/thrust/system/detail/generic/replace.inl
@@ -14,14 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
+#include <thrust/functional.h>
 #include <thrust/system/detail/generic/replace.h>
 #include <thrust/transform.h>
 #include <thrust/replace.h>
-#include <thrust/detail/internal_functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -55,7 +56,7 @@ template<typename Predicate, typename NewType, typename OutputType>
   {
     return pred(y) ? new_value : x;
   } // end operator()()
-  
+
   Predicate pred;
   NewType new_value;
 }; // end new_value_if
@@ -124,8 +125,9 @@ __host__ __device__
                               const T &old_value,
                               const T &new_value)
 {
-  thrust::detail::equal_to_value<T> pred(old_value);
-  return thrust::replace_copy_if(exec, first, last, result, pred, new_value);
+  using thrust::placeholders::_1;
+
+  return thrust::replace_copy_if(exec, first, last, result, _1 == old_value, new_value);
 } // end replace_copy()
 
 
@@ -164,13 +166,14 @@ __host__ __device__
                const T &old_value,
                const T &new_value)
 {
-  thrust::detail::equal_to_value<T> pred(old_value);
-  return thrust::replace_if(exec, first, last, pred, new_value);
+  using thrust::placeholders::_1;
+
+  return thrust::replace_if(exec, first, last, _1 == old_value, new_value);
 } // end replace()
 
 
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/reverse.h b/thrust/system/detail/generic/reverse.h
index 11421d41b..65c77ae75 100644
--- a/thrust/system/detail/generic/reverse.h
+++ b/thrust/system/detail/generic/reverse.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -50,7 +49,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/reverse.inl>
 
diff --git a/thrust/system/detail/generic/reverse.inl b/thrust/system/detail/generic/reverse.inl
index b77c75b6f..1ce6db38b 100644
--- a/thrust/system/detail/generic/reverse.inl
+++ b/thrust/system/detail/generic/reverse.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/reverse.h>
 #include <thrust/advance.h>
@@ -23,8 +25,7 @@
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/iterator/reverse_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -70,6 +71,6 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 
diff --git a/thrust/system/detail/generic/scalar/binary_search.h b/thrust/system/detail/generic/scalar/binary_search.h
index 373b59a60..3e019c223 100644
--- a/thrust/system/detail/generic/scalar/binary_search.h
+++ b/thrust/system/detail/generic/scalar/binary_search.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -79,7 +78,7 @@ bool binary_search(RandomAccessIterator first, RandomAccessIterator last, const
 
 } // end system
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/scalar/binary_search.inl>
 
diff --git a/thrust/system/detail/generic/scalar/binary_search.inl b/thrust/system/detail/generic/scalar/binary_search.inl
index 06a240f1e..61c71fba4 100644
--- a/thrust/system/detail/generic/scalar/binary_search.inl
+++ b/thrust/system/detail/generic/scalar/binary_search.inl
@@ -21,8 +21,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -52,7 +51,7 @@ RandomAccessIterator lower_bound_n(RandomAccessIterator first,
   Size start = 0, i;
   while(start < n)
   {
-    i = (start + n) / 2;
+    i = start + (n - start) / 2;  // Overflow-safe variant of (a+b)/2
     if(wrapped_comp(first[i], val))
     {
       start = i + 1;
@@ -62,7 +61,7 @@ RandomAccessIterator lower_bound_n(RandomAccessIterator first,
       n = i;
     }
   } // end while
-  
+
   return first + start;
 }
 
@@ -94,7 +93,7 @@ RandomAccessIterator upper_bound_n(RandomAccessIterator first,
   Size start = 0, i;
   while(start < n)
   {
-    i = (start + n) / 2;
+    i = start + (n - start) / 2;  // Overflow-safe variant of (a+b)/2
     if(wrapped_comp(val, first[i]))
     {
       n = i;
@@ -104,7 +103,7 @@ RandomAccessIterator upper_bound_n(RandomAccessIterator first,
       start = i + 1;
     }
   } // end while
-  
+
   return first + start;
 }
 
@@ -153,7 +152,6 @@ bool binary_search(RandomAccessIterator first, RandomAccessIterator last, const
 
 } // end system
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/scalar/binary_search.inl>
-
diff --git a/thrust/system/detail/generic/scan.h b/thrust/system/detail/generic/scan.h
index c32b0f2b9..476441ab6 100644
--- a/thrust/system/detail/generic/scan.h
+++ b/thrust/system/detail/generic/scan.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -93,7 +92,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/scan.inl>
 
diff --git a/thrust/system/detail/generic/scan.inl b/thrust/system/detail/generic/scan.inl
index 675d8f986..45a2aadd0 100644
--- a/thrust/system/detail/generic/scan.inl
+++ b/thrust/system/detail/generic/scan.inl
@@ -26,8 +26,7 @@
 #include <thrust/detail/type_traits/iterator/is_output_iterator.h>
 #include <thrust/functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -45,21 +44,8 @@ __host__ __device__
                                 InputIterator last,
                                 OutputIterator result)
 {
-  // the pseudocode for deducing the type of the temporary used below:
-  // 
-  // if OutputIterator is a "pure" output iterator
-  //   TemporaryType = InputIterator::value_type
-  // else
-  //   TemporaryType = OutputIterator::value_type
-
-  typedef typename thrust::detail::eval_if<
-      thrust::detail::is_output_iterator<OutputIterator>::value,
-      thrust::iterator_value<InputIterator>,
-      thrust::iterator_value<OutputIterator>
-  >::type ValueType;
-
   // assume plus as the associative operator
-  return thrust::inclusive_scan(exec, first, last, result, thrust::plus<ValueType>());
+  return thrust::inclusive_scan(exec, first, last, result, thrust::plus<>());
 } // end inclusive_scan()
 
 
@@ -72,21 +58,9 @@ __host__ __device__
                                 InputIterator last,
                                 OutputIterator result)
 {
-  // the pseudocode for deducing the type of the temporary used below:
-  // 
-  // if OutputIterator is a "pure" output iterator
-  //   TemporaryType = InputIterator::value_type
-  // else
-  //   TemporaryType = OutputIterator::value_type
-
-  typedef typename thrust::detail::eval_if<
-      thrust::detail::is_output_iterator<OutputIterator>::value,
-      thrust::iterator_value<InputIterator>,
-      thrust::iterator_value<OutputIterator>
-  >::type ValueType;
-
-  // assume 0 as the initialization value
-  return thrust::exclusive_scan(exec, first, last, result, ValueType(0));
+  // Use the input iterator's value type per https://wg21.link/P0571
+  using ValueType = typename thrust::iterator_value<InputIterator>::type;
+  return thrust::exclusive_scan(exec, first, last, result, ValueType{});
 } // end exclusive_scan()
 
 
@@ -102,7 +76,7 @@ __host__ __device__
                                 T init)
 {
   // assume plus as the associative operator
-  return thrust::exclusive_scan(exec, first, last, result, init, thrust::plus<T>());
+  return thrust::exclusive_scan(exec, first, last, result, init, thrust::plus<>());
 } // end exclusive_scan()
 
 
@@ -149,5 +123,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/scan_by_key.h b/thrust/system/detail/generic/scan_by_key.h
index 3c2ea7931..9e38ac933 100644
--- a/thrust/system/detail/generic/scan_by_key.h
+++ b/thrust/system/detail/generic/scan_by_key.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -138,7 +137,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/scan_by_key.inl>
 
diff --git a/thrust/system/detail/generic/scan_by_key.inl b/thrust/system/detail/generic/scan_by_key.inl
index 129cef17b..0e3100224 100644
--- a/thrust/system/detail/generic/scan_by_key.inl
+++ b/thrust/system/detail/generic/scan_by_key.inl
@@ -14,8 +14,10 @@
  *  limitations under the License.
  */
 
+#pragma once
 
 #include <thrust/detail/config.h>
+#include <thrust/detail/cstdint.h>
 #include <thrust/system/detail/generic/scan_by_key.h>
 #include <thrust/functional.h>
 #include <thrust/transform.h>
@@ -26,8 +28,7 @@
 #include <thrust/detail/internal_functional.h>
 #include <thrust/scan.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -42,12 +43,12 @@ template <typename OutputType, typename HeadFlagType, typename AssociativeOperat
 struct segmented_scan_functor
 {
   AssociativeOperator binary_op;
-  
+
   typedef typename thrust::tuple<OutputType, HeadFlagType> result_type;
-  
+
   __host__ __device__
   segmented_scan_functor(AssociativeOperator _binary_op) : binary_op(_binary_op) {}
-  
+
   __host__ __device__
   result_type operator()(result_type a, result_type b)
   {
@@ -71,8 +72,7 @@ __host__ __device__
                                        InputIterator2 first2,
                                        OutputIterator result)
 {
-  typedef typename thrust::iterator_traits<InputIterator1>::value_type InputType1;
-  return thrust::inclusive_scan_by_key(exec, first1, last1, first2, result, thrust::equal_to<InputType1>());
+  return thrust::inclusive_scan_by_key(exec, first1, last1, first2, result, thrust::equal_to<>());
 }
 
 
@@ -89,8 +89,7 @@ __host__ __device__
                                        OutputIterator result,
                                        BinaryPredicate binary_pred)
 {
-  typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
-  return thrust::inclusive_scan_by_key(exec, first1, last1, first2, result, binary_pred, thrust::plus<OutputType>());
+  return thrust::inclusive_scan_by_key(exec, first1, last1, first2, result, binary_pred, thrust::plus<>());
 }
 
 
@@ -109,8 +108,8 @@ __host__ __device__
                                        BinaryPredicate binary_pred,
                                        AssociativeOperator binary_op)
 {
-  typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
-  typedef unsigned int HeadFlagType;
+  using OutputType = typename thrust::iterator_traits<InputIterator2>::value_type;
+  using HeadFlagType = thrust::detail::uint8_t;
 
   const size_t n = last1 - first1;
 
@@ -120,7 +119,7 @@ __host__ __device__
     thrust::detail::temporary_array<HeadFlagType,DerivedPolicy> flags(exec, n);
     flags[0] = 1; thrust::transform(exec, first1, last1 - 1, first1 + 1, flags.begin() + 1, thrust::detail::not2(binary_pred));
 
-    // scan key-flag tuples, 
+    // scan key-flag tuples,
     // For additional details refer to Section 2 of the following paper
     //    S. Sengupta, M. Harris, and M. Garland. "Efficient parallel scan algorithms for GPUs"
     //    NVIDIA Technical Report NVR-2008-003, December 2008
@@ -147,8 +146,8 @@ __host__ __device__
                                        InputIterator2 first2,
                                        OutputIterator result)
 {
-  typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
-  return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, OutputType(0));
+  typedef typename thrust::iterator_traits<InputIterator2>::value_type InitType;
+  return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, InitType{});
 }
 
 
@@ -165,8 +164,7 @@ __host__ __device__
                                        OutputIterator result,
                                        T init)
 {
-  typedef typename thrust::iterator_traits<InputIterator1>::value_type InputType1;
-  return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, init, thrust::equal_to<InputType1>());
+  return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, init, thrust::equal_to<>());
 }
 
 
@@ -185,8 +183,7 @@ __host__ __device__
                                        T init,
                                        BinaryPredicate binary_pred)
 {
-  typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
-  return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, init, binary_pred, thrust::plus<OutputType>());
+  return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, init, binary_pred, thrust::plus<>());
 }
 
 
@@ -207,8 +204,8 @@ __host__ __device__
                                        BinaryPredicate binary_pred,
                                        AssociativeOperator binary_op)
 {
-  typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
-  typedef unsigned int HeadFlagType;
+  using OutputType = T;
+  using HeadFlagType = thrust::detail::uint8_t;
 
   const size_t n = last1 - first1;
 
@@ -225,7 +222,7 @@ __host__ __device__
     thrust::replace_copy_if(exec, first2, last2 - 1, flags.begin() + 1, temp.begin() + 1, thrust::negate<HeadFlagType>(), init);
     temp[0] = init;
 
-    // scan key-flag tuples, 
+    // scan key-flag tuples,
     // For additional details refer to Section 2 of the following paper
     //    S. Sengupta, M. Harris, and M. Garland. "Efficient parallel scan algorithms for GPUs"
     //    NVIDIA Technical Report NVR-2008-003, December 2008
@@ -244,5 +241,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/scatter.h b/thrust/system/detail/generic/scatter.h
index 4a65a4cc0..6bb7949ef 100644
--- a/thrust/system/detail/generic/scatter.h
+++ b/thrust/system/detail/generic/scatter.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -75,7 +74,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/scatter.inl>
 
diff --git a/thrust/system/detail/generic/scatter.inl b/thrust/system/detail/generic/scatter.inl
index 7a1f52298..5b4798708 100644
--- a/thrust/system/detail/generic/scatter.inl
+++ b/thrust/system/detail/generic/scatter.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/scatter.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -21,8 +23,7 @@
 #include <thrust/transform.h>
 #include <thrust/iterator/permutation_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -92,5 +93,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/select_system.h b/thrust/system/detail/generic/select_system.h
index 3b5d77503..7619b80e5 100644
--- a/thrust/system/detail/generic/select_system.h
+++ b/thrust/system/detail/generic/select_system.h
@@ -24,8 +24,7 @@
 #include <thrust/iterator/detail/device_system_tag.h>
 #include <thrust/iterator/detail/any_system_tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -120,6 +119,6 @@ thrust::device_system_tag select_system(thrust::any_system_tag);
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/select_system.inl>
diff --git a/thrust/system/detail/generic/select_system.inl b/thrust/system/detail/generic/select_system.inl
index fbe3094be..b69d17c45 100644
--- a/thrust/system/detail/generic/select_system.inl
+++ b/thrust/system/detail/generic/select_system.inl
@@ -21,8 +21,7 @@
 #include <thrust/detail/type_traits.h>
 #include <thrust/system/detail/generic/select_system_exists.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -175,5 +174,5 @@ thrust::device_system_tag select_system(thrust::any_system_tag)
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/select_system_exists.h b/thrust/system/detail/generic/select_system_exists.h
index ba8ef8bb7..29d05781d 100644
--- a/thrust/system/detail/generic/select_system_exists.h
+++ b/thrust/system/detail/generic/select_system_exists.h
@@ -23,8 +23,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 // forward declaration of any_system_tag for any_conversion below
 struct any_system_tag;
@@ -164,5 +163,5 @@ template<typename Tag1, typename Tag2, typename Tag3, typename Tag4, typename Ta
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/sequence.h b/thrust/system/detail/generic/sequence.h
index a7bc842ae..26bf17bb8 100644
--- a/thrust/system/detail/generic/sequence.h
+++ b/thrust/system/detail/generic/sequence.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -58,7 +57,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/sequence.inl>
 
diff --git a/thrust/system/detail/generic/sequence.inl b/thrust/system/detail/generic/sequence.inl
index 507f8b01d..0e11dd75d 100644
--- a/thrust/system/detail/generic/sequence.inl
+++ b/thrust/system/detail/generic/sequence.inl
@@ -14,43 +14,20 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/sequence.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/tabulate.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
 {
 namespace generic
 {
-namespace sequence_detail
-{
-
-
-template<typename T>
-struct sequence_functor
-{
-  T init, step;
-
-  __host__ __device__
-  sequence_functor(T init, T step)
-    : init(init), step(step)
-  {}
-
-  template<typename Index>
-  __host__ __device__
-  T operator()(Index i) const
-  {
-    return static_cast<T>(init + step * i);
-  }
-};
-
-
-} // end sequence_detail
 
 
 template<typename DerivedPolicy, typename ForwardIterator>
@@ -75,6 +52,35 @@ __host__ __device__
   thrust::sequence(exec, first, last, init, T(1));
 } // end sequence()
 
+namespace detail
+{
+template <typename T, typename = void>
+struct compute_sequence_value
+{
+  T init;
+  T step;
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  T operator()(std::size_t i) const
+  {
+    return init + step * i;
+  }
+};
+template <typename T>
+struct compute_sequence_value<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
+{
+  T init;
+  T step;
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  T operator()(std::size_t i) const
+  {
+    return init + step * static_cast<T>(i);
+  }
+};
+}
 
 template<typename DerivedPolicy, typename ForwardIterator, typename T>
 __host__ __device__
@@ -84,13 +90,17 @@ __host__ __device__
                 T init,
                 T step)
 {
-  // XXX TODO use a placeholder expression here
-  thrust::tabulate(exec, first, last, sequence_detail::sequence_functor<T>(init, step));
+
+  thrust::tabulate(exec,
+                   first,
+                   last,
+                   detail::compute_sequence_value<T>{std::move(init),
+                                                     std::move(step)});
 } // end sequence()
 
 
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/set_operations.h b/thrust/system/detail/generic/set_operations.h
index 4dbee0ae4..37665d78d 100644
--- a/thrust/system/detail/generic/set_operations.h
+++ b/thrust/system/detail/generic/set_operations.h
@@ -20,8 +20,7 @@
 #include <thrust/system/detail/generic/tag.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -313,7 +312,7 @@ thrust::pair<OutputIterator1,OutputIterator2>
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/set_operations.inl>
 
diff --git a/thrust/system/detail/generic/set_operations.inl b/thrust/system/detail/generic/set_operations.inl
index 6264aff16..4363be5c0 100644
--- a/thrust/system/detail/generic/set_operations.inl
+++ b/thrust/system/detail/generic/set_operations.inl
@@ -25,8 +25,7 @@
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -473,5 +472,5 @@ OutputIterator set_union(thrust::execution_policy<DerivedPolicy> &,
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/shuffle.h b/thrust/system/detail/generic/shuffle.h
new file mode 100644
index 000000000..8f8e21afd
--- /dev/null
+++ b/thrust/system/detail/generic/shuffle.h
@@ -0,0 +1,54 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file shuffle.h
+ *  \brief Generic implementations of shuffle functions.
+ */
+
+#pragma once
+
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/system/detail/generic/tag.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace system {
+namespace detail {
+namespace generic {
+
+template <typename ExecutionPolicy, typename RandomIterator, typename URBG>
+__host__ __device__ void shuffle(
+    thrust::execution_policy<ExecutionPolicy>& exec, RandomIterator first,
+    RandomIterator last, URBG&& g);
+
+template <typename ExecutionPolicy, typename RandomIterator,
+          typename OutputIterator, typename URBG>
+__host__ __device__ void shuffle_copy(
+    thrust::execution_policy<ExecutionPolicy>& exec, RandomIterator first,
+    RandomIterator last, OutputIterator result, URBG&& g);
+
+}  // end namespace generic
+}  // end namespace detail
+}  // end namespace system
+THRUST_NAMESPACE_END
+
+#include <thrust/system/detail/generic/shuffle.inl>
+
+#endif
diff --git a/thrust/system/detail/generic/shuffle.inl b/thrust/system/detail/generic/shuffle.inl
new file mode 100644
index 000000000..baece51be
--- /dev/null
+++ b/thrust/system/detail/generic/shuffle.inl
@@ -0,0 +1,190 @@
+/*
+ *  Copyright 2008-20120 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/random.h>
+#include <thrust/scan.h>
+#include <thrust/system/detail/generic/shuffle.h>
+
+#include <cstdint>
+
+THRUST_NAMESPACE_BEGIN
+namespace system {
+namespace detail {
+namespace generic {
+
+// An implementation of a Feistel cipher for operating on 64 bit keys
+class feistel_bijection {
+  struct round_state {
+    std::uint32_t left;
+    std::uint32_t right;
+  };
+
+ public:
+  template <class URBG>
+  __host__ __device__ feistel_bijection(std::uint64_t m, URBG&& g) {
+    std::uint64_t total_bits = get_cipher_bits(m);
+    // Half bits rounded down
+    left_side_bits = total_bits / 2;
+    left_side_mask = (1ull << left_side_bits) - 1;
+    // Half the bits rounded up
+    right_side_bits = total_bits - left_side_bits;
+    right_side_mask = (1ull << right_side_bits) - 1;
+
+    for (std::uint32_t i = 0; i < num_rounds; i++) {
+      key[i] = g();
+    }
+  }
+
+  __host__ __device__ std::uint64_t nearest_power_of_two() const {
+    return 1ull << (left_side_bits + right_side_bits);
+  }
+
+  __host__ __device__ std::uint64_t operator()(const std::uint64_t val) const {
+    std::uint32_t state[2] = { static_cast<std::uint32_t>( val >> right_side_bits ), static_cast<std::uint32_t>( val & right_side_mask ) };
+    for( std::uint32_t i = 0; i < num_rounds; i++ )
+    {
+        std::uint32_t hi, lo;
+        constexpr std::uint64_t M0 = UINT64_C( 0xD2B74407B1CE6E93 );
+        mulhilo( M0, state[0], hi, lo );
+        lo = ( lo << ( right_side_bits - left_side_bits ) ) | state[1] >> left_side_bits;
+        state[0] = ( ( hi ^ key[i] ) ^ state[1] ) & left_side_mask;
+        state[1] = lo & right_side_mask;
+    }
+    // Combine the left and right sides together to get result
+    return static_cast<std::uint64_t>(state[0] << right_side_bits) | static_cast<std::uint64_t>(state[1]);
+  }
+
+ private:
+   // Perform 64 bit multiplication and save result in two 32 bit int
+   static __host__ __device__ void mulhilo( std::uint64_t a, std::uint64_t b, std::uint32_t& hi, std::uint32_t& lo )
+   {
+       std::uint64_t product = a * b;
+       hi = static_cast<std::uint32_t>( product >> 32 );
+       lo = static_cast<std::uint32_t>( product );
+   }
+
+  // Find the nearest power of two
+  static __host__ __device__ std::uint64_t get_cipher_bits(std::uint64_t m) {
+    if (m <= 16) return 4;
+    std::uint64_t i = 0;
+    m--;
+    while (m != 0) {
+      i++;
+      m >>= 1;
+    }
+    return i;
+  }
+
+  static constexpr std::uint32_t num_rounds = 24;
+  std::uint64_t right_side_bits;
+  std::uint64_t left_side_bits;
+  std::uint64_t right_side_mask;
+  std::uint64_t left_side_mask;
+  std::uint32_t key[num_rounds];
+};
+
+struct key_flag_tuple {
+  std::uint64_t key;
+  std::uint64_t flag;
+};
+
+// scan only flags
+struct key_flag_scan_op {
+  __host__ __device__ key_flag_tuple operator()(const key_flag_tuple& a,
+                                                const key_flag_tuple& b) {
+    return {b.key, a.flag + b.flag};
+  }
+};
+
+struct construct_key_flag_op {
+  std::uint64_t m;
+  feistel_bijection bijection;
+  __host__ __device__ construct_key_flag_op(std::uint64_t m,
+                                            feistel_bijection bijection)
+      : m(m), bijection(bijection) {}
+  __host__ __device__ key_flag_tuple operator()(std::uint64_t idx) {
+    auto gather_key = bijection(idx);
+    return key_flag_tuple{gather_key, (gather_key < m) ? 1ull : 0ull};
+  }
+};
+
+template <typename InputIterT, typename OutputIterT>
+struct write_output_op {
+  std::uint64_t m;
+  InputIterT in;
+  OutputIterT out;
+  // flag contains inclusive scan of valid keys
+  // perform gather using valid keys
+  __thrust_exec_check_disable__
+  __host__ __device__ std::size_t operator()(key_flag_tuple x) {
+    if (x.key < m) {
+      // -1 because inclusive scan
+      out[x.flag - 1] = in[x.key];
+    }
+    return 0;  // Discarded
+  }
+};
+
+template <typename ExecutionPolicy, typename RandomIterator, typename URBG>
+__host__ __device__ void shuffle(
+    thrust::execution_policy<ExecutionPolicy>& exec, RandomIterator first,
+    RandomIterator last, URBG&& g) {
+  using InputType = typename thrust::iterator_value_t<RandomIterator>;
+
+  // copy input to temp buffer
+  thrust::detail::temporary_array<InputType, ExecutionPolicy> temp(exec, first,
+                                                                   last);
+  thrust::shuffle_copy(exec, temp.begin(), temp.end(), first, g);
+}
+
+template <typename ExecutionPolicy, typename RandomIterator,
+          typename OutputIterator, typename URBG>
+__host__ __device__ void shuffle_copy(
+    thrust::execution_policy<ExecutionPolicy>& exec, RandomIterator first,
+    RandomIterator last, OutputIterator result, URBG&& g) {
+  // m is the length of the input
+  // we have an available bijection of length n via a feistel cipher
+  std::size_t m = last - first;
+  feistel_bijection bijection(m, g);
+  std::uint64_t n = bijection.nearest_power_of_two();
+
+  // perform stream compaction over length n bijection to get length m
+  // pseudorandom bijection over the original input
+  thrust::counting_iterator<std::uint64_t> indices(0);
+  thrust::transform_iterator<construct_key_flag_op, decltype(indices),
+                             key_flag_tuple>
+      key_flag_it(indices, construct_key_flag_op(m, bijection));
+  write_output_op<RandomIterator, decltype(result)> write_functor{m, first,
+                                                                  result};
+  auto gather_output_it = thrust::make_transform_output_iterator(
+      thrust::discard_iterator<std::size_t>(), write_functor);
+  // the feistel_bijection outputs a stream of permuted indices in range [0,n)
+  // flag each value < m and compact it, so we have a set of permuted indices in
+  // range [0,m) each thread gathers an input element according to its
+  // pseudorandom permuted index
+  thrust::inclusive_scan(exec, key_flag_it, key_flag_it + n, gather_output_it,
+                         key_flag_scan_op());
+}
+
+}  // end namespace generic
+}  // end namespace detail
+}  // end namespace system
+THRUST_NAMESPACE_END
diff --git a/thrust/system/detail/generic/sort.h b/thrust/system/detail/generic/sort.h
index 9d4ac1998..cd8d45562 100644
--- a/thrust/system/detail/generic/sort.h
+++ b/thrust/system/detail/generic/sort.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -148,7 +147,7 @@ __host__ __device__
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/sort.inl>
 
diff --git a/thrust/system/detail/generic/sort.inl b/thrust/system/detail/generic/sort.inl
index 5f0fb7ebf..632cab435 100644
--- a/thrust/system/detail/generic/sort.inl
+++ b/thrust/system/detail/generic/sort.inl
@@ -28,8 +28,7 @@
 #include <thrust/tuple.h>
 #include <thrust/detail/internal_functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -216,5 +215,5 @@ __host__ __device__
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/swap_ranges.h b/thrust/system/detail/generic/swap_ranges.h
index 78769715c..edb5acf31 100644
--- a/thrust/system/detail/generic/swap_ranges.h
+++ b/thrust/system/detail/generic/swap_ranges.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -41,7 +40,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/swap_ranges.inl>
 
diff --git a/thrust/system/detail/generic/swap_ranges.inl b/thrust/system/detail/generic/swap_ranges.inl
index 81977adc2..ea42df35b 100644
--- a/thrust/system/detail/generic/swap_ranges.inl
+++ b/thrust/system/detail/generic/swap_ranges.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/swap_ranges.h>
 #include <thrust/tuple.h>
@@ -21,8 +23,7 @@
 #include <thrust/detail/internal_functional.h>
 #include <thrust/for_each.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -74,5 +75,5 @@ __host__ __device__
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/tabulate.h b/thrust/system/detail/generic/tabulate.h
index 5cb75e928..041093e82 100644
--- a/thrust/system/detail/generic/tabulate.h
+++ b/thrust/system/detail/generic/tabulate.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -43,7 +42,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/tabulate.inl>
 
diff --git a/thrust/system/detail/generic/tabulate.inl b/thrust/system/detail/generic/tabulate.inl
index 1a740d26a..0fd2121c1 100644
--- a/thrust/system/detail/generic/tabulate.inl
+++ b/thrust/system/detail/generic/tabulate.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tabulate.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -21,8 +23,7 @@
 #include <thrust/distance.h>
 #include <thrust/iterator/counting_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -55,6 +56,6 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 
diff --git a/thrust/system/detail/generic/tag.h b/thrust/system/detail/generic/tag.h
index 4da1e79ce..48f094797 100644
--- a/thrust/system/detail/generic/tag.h
+++ b/thrust/system/detail/generic/tag.h
@@ -23,8 +23,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -44,5 +43,5 @@ struct tag
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/temporary_buffer.h b/thrust/system/detail/generic/temporary_buffer.h
index 953401139..6b7e01ff2 100644
--- a/thrust/system/detail/generic/temporary_buffer.h
+++ b/thrust/system/detail/generic/temporary_buffer.h
@@ -21,8 +21,7 @@
 #include <thrust/pair.h>
 #include <thrust/detail/pointer.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -37,6 +36,13 @@ __host__ __device__
     get_temporary_buffer(thrust::execution_policy<DerivedPolicy> &exec, typename thrust::pointer<T,DerivedPolicy>::difference_type n);
 
 
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename Pointer>
+__host__ __device__
+  void return_temporary_buffer(thrust::execution_policy<DerivedPolicy> &exec, Pointer p, std::ptrdiff_t n);
+
+
+__thrust_exec_check_disable__
 template<typename DerivedPolicy, typename Pointer>
 __host__ __device__
   void return_temporary_buffer(thrust::execution_policy<DerivedPolicy> &exec, Pointer p);
@@ -45,7 +51,7 @@ __host__ __device__
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/temporary_buffer.inl>
 
diff --git a/thrust/system/detail/generic/temporary_buffer.inl b/thrust/system/detail/generic/temporary_buffer.inl
index 838d013bc..254c48cb9 100644
--- a/thrust/system/detail/generic/temporary_buffer.inl
+++ b/thrust/system/detail/generic/temporary_buffer.inl
@@ -14,14 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/temporary_buffer.h>
 #include <thrust/detail/pointer.h>
 #include <thrust/detail/malloc_and_free.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -47,10 +48,33 @@ __host__ __device__
 } // end get_temporary_buffer()
 
 
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename Pointer>
+__host__ __device__
+  void return_temporary_buffer(thrust::execution_policy<DerivedPolicy> &exec, Pointer p, std::ptrdiff_t)
+{
+  // If we are here, no user customization of the three-argument signature with
+  // a size parameter of `return_temporary_buffer` was found. There may be an
+  // old two-argument signature `return_temporary_buffer` though, so we make
+  // another ADL call to try and find one.
+  //
+  // The interface layer downcast and then did ADL dispatch - there were no
+  // matches for DerivedPolicy (aka no one customized the three-argument
+  // signature), so this overload got found an implicit upcast to
+  // `execution_policy<DerivedPolicy>` was done. Now, we're looking for a
+  // customization of the two-argument signature so we need to downcast again.
+  return_temporary_buffer(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), p);
+} // end return_temporary_buffer()
+
+
+__thrust_exec_check_disable__
 template<typename DerivedPolicy, typename Pointer>
 __host__ __device__
   void return_temporary_buffer(thrust::execution_policy<DerivedPolicy> &exec, Pointer p)
 {
+  // If we are here, no user customization of either the old two-argument
+  // signature or the new three-argument signature with a size parameter of
+  // `return_temporary_buffer` was found.
   thrust::free(exec, p);
 } // end return_temporary_buffer()
 
@@ -58,5 +82,5 @@ __host__ __device__
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/transform.h b/thrust/system/detail/generic/transform.h
index 1aa2f4993..30e032696 100644
--- a/thrust/system/detail/generic/transform.h
+++ b/thrust/system/detail/generic/transform.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -100,7 +99,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/transform.inl>
 
diff --git a/thrust/system/detail/generic/transform.inl b/thrust/system/detail/generic/transform.inl
index 589eb65c7..122c42580 100644
--- a/thrust/system/detail/generic/transform.inl
+++ b/thrust/system/detail/generic/transform.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/transform.h>
 #include <thrust/for_each.h>
@@ -23,8 +25,7 @@
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/detail/internal_functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -186,5 +187,5 @@ __host__ __device__
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/transform_reduce.h b/thrust/system/detail/generic/transform_reduce.h
index 23123fa49..af510296e 100644
--- a/thrust/system/detail/generic/transform_reduce.h
+++ b/thrust/system/detail/generic/transform_reduce.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -47,7 +46,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/transform_reduce.inl>
 
diff --git a/thrust/system/detail/generic/transform_reduce.inl b/thrust/system/detail/generic/transform_reduce.inl
index 7340f8355..539c3b22c 100644
--- a/thrust/system/detail/generic/transform_reduce.inl
+++ b/thrust/system/detail/generic/transform_reduce.inl
@@ -14,13 +14,14 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/transform_reduce.h>
 #include <thrust/reduce.h>
 #include <thrust/iterator/transform_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -30,8 +31,8 @@ namespace generic
 
 
 template<typename DerivedPolicy,
-         typename InputIterator, 
-         typename UnaryFunction, 
+         typename InputIterator,
+         typename UnaryFunction,
          typename OutputType,
          typename BinaryFunction>
 __host__ __device__
@@ -52,5 +53,5 @@ __host__ __device__
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/transform_scan.h b/thrust/system/detail/generic/transform_scan.h
index 3f81434fc..05054c965 100644
--- a/thrust/system/detail/generic/transform_scan.h
+++ b/thrust/system/detail/generic/transform_scan.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -62,7 +61,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/transform_scan.inl>
 
diff --git a/thrust/system/detail/generic/transform_scan.inl b/thrust/system/detail/generic/transform_scan.inl
index e411613c6..c9c976687 100644
--- a/thrust/system/detail/generic/transform_scan.inl
+++ b/thrust/system/detail/generic/transform_scan.inl
@@ -18,15 +18,15 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/system/detail/generic/transform_scan.h>
-#include <thrust/scan.h>
-#include <thrust/iterator/transform_iterator.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/type_traits/function_traits.h>
 #include <thrust/detail/type_traits/iterator/is_output_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/scan.h>
+#include <thrust/system/detail/generic/transform_scan.h>
+#include <thrust/type_traits/remove_cvref.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -48,27 +48,10 @@ __host__ __device__
                                           UnaryFunction unary_op,
                                           BinaryFunction binary_op)
 {
-  // the pseudocode for deducing the type of the temporary used below:
-  // 
-  // if UnaryFunction is AdaptableUnaryFunction
-  //   TemporaryType = AdaptableUnaryFunction::result_type
-  // else if OutputIterator is a "pure" output iterator
-  //   TemporaryType = InputIterator::value_type
-  // else
-  //   TemporaryType = OutputIterator::value_type
-  //
-  // XXX upon c++0x, TemporaryType needs to be:
-  // result_of_adaptable_function<UnaryFunction>::type
-
-  typedef typename thrust::detail::eval_if<
-    thrust::detail::has_result_type<UnaryFunction>::value,
-    thrust::detail::result_type<UnaryFunction>,
-    thrust::detail::eval_if<
-      thrust::detail::is_output_iterator<OutputIterator>::value,
-      thrust::iterator_value<InputIterator>,
-      thrust::iterator_value<OutputIterator>
-    >
-  >::type ValueType;
+  // Use the input iterator's value type per https://wg21.link/P0571
+  using InputType = typename thrust::iterator_value<InputIterator>::type;
+  using ResultType = thrust::detail::invoke_result_t<UnaryFunction, InputType>;
+  using ValueType = thrust::remove_cvref_t<ResultType>;
 
   thrust::transform_iterator<UnaryFunction, InputIterator, ValueType> _first(first, unary_op);
   thrust::transform_iterator<UnaryFunction, InputIterator, ValueType> _last(last, unary_op);
@@ -81,7 +64,7 @@ template<typename ExecutionPolicy,
          typename InputIterator,
          typename OutputIterator,
          typename UnaryFunction,
-         typename T,
+         typename InitialValueType,
          typename AssociativeOperator>
 __host__ __device__
   OutputIterator transform_exclusive_scan(thrust::execution_policy<ExecutionPolicy> &exec,
@@ -89,30 +72,11 @@ __host__ __device__
                                           InputIterator last,
                                           OutputIterator result,
                                           UnaryFunction unary_op,
-                                          T init,
+                                          InitialValueType init,
                                           AssociativeOperator binary_op)
 {
-  // the pseudocode for deducing the type of the temporary used below:
-  // 
-  // if UnaryFunction is AdaptableUnaryFunction
-  //   TemporaryType = AdaptableUnaryFunction::result_type
-  // else if OutputIterator is a "pure" output iterator
-  //   TemporaryType = InputIterator::value_type
-  // else
-  //   TemporaryType = OutputIterator::value_type
-  //
-  // XXX upon c++0x, TemporaryType needs to be:
-  // result_of_adaptable_function<UnaryFunction>::type
-
-  typedef typename thrust::detail::eval_if<
-    thrust::detail::has_result_type<UnaryFunction>::value,
-    thrust::detail::result_type<UnaryFunction>,
-    thrust::detail::eval_if<
-      thrust::detail::is_output_iterator<OutputIterator>::value,
-      thrust::iterator_value<InputIterator>,
-      thrust::iterator_value<OutputIterator>
-    >
-  >::type ValueType;
+  // Use the initial value type per https://wg21.link/P0571
+  using ValueType = thrust::remove_cvref_t<InitialValueType>;
 
   thrust::transform_iterator<UnaryFunction, InputIterator, ValueType> _first(first, unary_op);
   thrust::transform_iterator<UnaryFunction, InputIterator, ValueType> _last(last, unary_op);
@@ -124,5 +88,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/uninitialized_copy.h b/thrust/system/detail/generic/uninitialized_copy.h
index 2d1b0010d..bac5bcf96 100644
--- a/thrust/system/detail/generic/uninitialized_copy.h
+++ b/thrust/system/detail/generic/uninitialized_copy.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -51,7 +50,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/uninitialized_copy.inl>
 
diff --git a/thrust/system/detail/generic/uninitialized_copy.inl b/thrust/system/detail/generic/uninitialized_copy.inl
index d6babf65c..679d1f6ba 100644
--- a/thrust/system/detail/generic/uninitialized_copy.inl
+++ b/thrust/system/detail/generic/uninitialized_copy.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/uninitialized_copy.h>
 #include <thrust/copy.h>
@@ -22,8 +24,7 @@
 #include <thrust/detail/type_traits.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -189,5 +190,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/uninitialized_fill.h b/thrust/system/detail/generic/uninitialized_fill.h
index 6acc65d08..4f5404508 100644
--- a/thrust/system/detail/generic/uninitialized_fill.h
+++ b/thrust/system/detail/generic/uninitialized_fill.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -51,7 +50,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/uninitialized_fill.inl>
 
diff --git a/thrust/system/detail/generic/uninitialized_fill.inl b/thrust/system/detail/generic/uninitialized_fill.inl
index 0d4cf3f54..062414945 100644
--- a/thrust/system/detail/generic/uninitialized_fill.inl
+++ b/thrust/system/detail/generic/uninitialized_fill.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/uninitialized_fill.h>
 #include <thrust/fill.h>
@@ -21,8 +23,7 @@
 #include <thrust/detail/type_traits.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -130,5 +131,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/unique.h b/thrust/system/detail/generic/unique.h
index 04388cbc0..ce3bff884 100644
--- a/thrust/system/detail/generic/unique.h
+++ b/thrust/system/detail/generic/unique.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -69,10 +68,30 @@ OutputIterator unique_copy(thrust::execution_policy<DerivedPolicy> &exec,
                            BinaryPredicate binary_pred);
 
 
+template<typename DerivedPolicy,
+         typename ForwardIterator>
+__host__ __device__
+typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(thrust::execution_policy<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last);
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+__host__ __device__
+typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(thrust::execution_policy<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred);
+
+
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/unique.inl>
 
diff --git a/thrust/system/detail/generic/unique.inl b/thrust/system/detail/generic/unique.inl
index 4cd3459fd..bb66e3585 100644
--- a/thrust/system/detail/generic/unique.inl
+++ b/thrust/system/detail/generic/unique.inl
@@ -14,11 +14,6 @@
  *  limitations under the License.
  */
 
-
-/*! \file unique.inl
- *  \brief Inline file for unique.h.
- */
-
 #pragma once
 
 #include <thrust/detail/config.h>
@@ -29,12 +24,12 @@
 #include <thrust/detail/temporary_array.h>
 #include <thrust/detail/internal_functional.h>
 #include <thrust/detail/copy_if.h>
+#include <thrust/detail/count.h>
 #include <thrust/distance.h>
 #include <thrust/functional.h>
 #include <thrust/detail/range/head_flags.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -66,9 +61,9 @@ __host__ __device__
                          BinaryPredicate binary_pred)
 {
   typedef typename thrust::iterator_traits<ForwardIterator>::value_type InputType;
-  
+
   thrust::detail::temporary_array<InputType,DerivedPolicy> input(exec, first, last);
-  
+
   return thrust::unique_copy(exec, input.begin(), input.end(), first, binary_pred);
 } // end unique()
 
@@ -99,15 +94,46 @@ __host__ __device__
                              BinaryPredicate binary_pred)
 {
   thrust::detail::head_flags<InputIterator, BinaryPredicate> stencil(first, last, binary_pred);
+
+  using namespace thrust::placeholders;
+
+  return thrust::copy_if(exec, first, last, stencil.begin(), output, _1);
+} // end unique_copy()
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+__host__ __device__
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(thrust::execution_policy<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred)
+{
+  thrust::detail::head_flags<ForwardIterator, BinaryPredicate> stencil(first, last, binary_pred);
   
   using namespace thrust::placeholders;
   
-  return thrust::copy_if(exec, first, last, stencil.begin(), output, _1);
+  return thrust::count_if(exec, stencil.begin(), stencil.end(), _1);
+} // end unique_copy()
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator>
+__host__ __device__
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(thrust::execution_policy<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last)
+{
+  typedef typename thrust::iterator_value<ForwardIterator>::type value_type;
+  return thrust::unique_count(exec, first, last, thrust::equal_to<value_type>());
 } // end unique_copy()
 
 
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/unique_by_key.h b/thrust/system/detail/generic/unique_by_key.h
index cb03179de..0ea9e7cc8 100644
--- a/thrust/system/detail/generic/unique_by_key.h
+++ b/thrust/system/detail/generic/unique_by_key.h
@@ -20,8 +20,7 @@
 #include <thrust/system/detail/generic/tag.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -89,7 +88,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/unique_by_key.inl>
 
diff --git a/thrust/system/detail/generic/unique_by_key.inl b/thrust/system/detail/generic/unique_by_key.inl
index ff8c5b554..ffcf1dd0c 100644
--- a/thrust/system/detail/generic/unique_by_key.inl
+++ b/thrust/system/detail/generic/unique_by_key.inl
@@ -28,8 +28,7 @@
 #include <thrust/unique.h>
 #include <thrust/detail/range/head_flags.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -136,5 +135,5 @@ unique_by_key_copy(thrust::execution_policy<ExecutionPolicy> &exec,
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/internal/decompose.h b/thrust/system/detail/internal/decompose.h
index e949f2024..58af7c551 100644
--- a/thrust/system/detail/internal/decompose.h
+++ b/thrust/system/detail/internal/decompose.h
@@ -18,8 +18,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -110,5 +109,5 @@ namespace internal
 } // end namespace internal
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/adjacent_difference.h b/thrust/system/detail/sequential/adjacent_difference.h
index c6b0ee1b2..4a9dad82c 100644
--- a/thrust/system/detail/sequential/adjacent_difference.h
+++ b/thrust/system/detail/sequential/adjacent_difference.h
@@ -25,8 +25,7 @@
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -70,5 +69,5 @@ OutputIterator adjacent_difference(sequential::execution_policy<DerivedPolicy> &
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/assign_value.h b/thrust/system/detail/sequential/assign_value.h
index 699bcbcd7..0eb145d13 100644
--- a/thrust/system/detail/sequential/assign_value.h
+++ b/thrust/system/detail/sequential/assign_value.h
@@ -20,8 +20,7 @@
 #include <thrust/system/detail/sequential/execution_policy.h>
 #include <thrust/detail/raw_pointer_cast.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -39,5 +38,5 @@ __host__ __device__
 } // end sequential
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/binary_search.h b/thrust/system/detail/sequential/binary_search.h
index 54534143e..2da5080f4 100644
--- a/thrust/system/detail/sequential/binary_search.h
+++ b/thrust/system/detail/sequential/binary_search.h
@@ -21,13 +21,14 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/advance.h>
 #include <thrust/distance.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/function.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -153,5 +154,5 @@ bool binary_search(sequential::execution_policy<DerivedPolicy> &exec,
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/copy.h b/thrust/system/detail/sequential/copy.h
index 80853f670..0dd2cdad5 100644
--- a/thrust/system/detail/sequential/copy.h
+++ b/thrust/system/detail/sequential/copy.h
@@ -23,8 +23,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -57,7 +56,7 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/sequential/copy.inl>
 
diff --git a/thrust/system/detail/sequential/copy.inl b/thrust/system/detail/sequential/copy.inl
index 8027681d0..850f20f1e 100644
--- a/thrust/system/detail/sequential/copy.inl
+++ b/thrust/system/detail/sequential/copy.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/sequential/copy.h>
 #include <thrust/detail/type_traits.h>
@@ -23,8 +25,7 @@
 #include <thrust/detail/type_traits/pointer_traits.h>
 #include <thrust/type_traits/is_trivially_relocatable.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -141,5 +142,5 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/copy_backward.h b/thrust/system/detail/sequential/copy_backward.h
index e825436b1..d127ac80d 100644
--- a/thrust/system/detail/sequential/copy_backward.h
+++ b/thrust/system/detail/sequential/copy_backward.h
@@ -18,8 +18,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -50,5 +49,5 @@ BidirectionalIterator2 copy_backward(BidirectionalIterator1 first,
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/copy_if.h b/thrust/system/detail/sequential/copy_if.h
index bb29ccdeb..3c00956de 100644
--- a/thrust/system/detail/sequential/copy_if.h
+++ b/thrust/system/detail/sequential/copy_if.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -69,5 +68,5 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/execution_policy.h b/thrust/system/detail/sequential/execution_policy.h
index 7b5f69666..99d78fc27 100644
--- a/thrust/system/detail/sequential/execution_policy.h
+++ b/thrust/system/detail/sequential/execution_policy.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -50,7 +49,7 @@ template<>
 // tag's definition comes before the generic definition of execution_policy
 struct tag : execution_policy<tag>
 {
-  __host__ __device__ tag() {}
+  __host__ __device__ constexpr tag() {}
 };
 
 // allow conversion to tag when it is not a successor
@@ -66,15 +65,11 @@ template<typename Derived>
 };
 
 
-#ifdef __CUDA_ARCH__
-static const __device__ tag seq;
-#else
-static const tag seq;
-#endif
+THRUST_INLINE_CONSTANT tag seq;
 
 
 } // end sequential
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/extrema.h b/thrust/system/detail/sequential/extrema.h
index 7bfa5a17d..5e5c62da6 100644
--- a/thrust/system/detail/sequential/extrema.h
+++ b/thrust/system/detail/sequential/extrema.h
@@ -26,8 +26,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -135,5 +134,5 @@ thrust::pair<ForwardIterator,ForwardIterator> minmax_element(sequential::executi
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/find.h b/thrust/system/detail/sequential/find.h
index 5e551b74a..54c238c71 100644
--- a/thrust/system/detail/sequential/find.h
+++ b/thrust/system/detail/sequential/find.h
@@ -25,8 +25,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -67,5 +66,5 @@ InputIterator find_if(execution_policy<DerivedPolicy> &,
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/for_each.h b/thrust/system/detail/sequential/for_each.h
index 6e83d18c1..7058c56f2 100644
--- a/thrust/system/detail/sequential/for_each.h
+++ b/thrust/system/detail/sequential/for_each.h
@@ -25,8 +25,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -91,5 +90,5 @@ InputIterator for_each_n(sequential::execution_policy<DerivedPolicy> &,
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/general_copy.h b/thrust/system/detail/sequential/general_copy.h
index 9546b72e5..6ea87bbac 100644
--- a/thrust/system/detail/sequential/general_copy.h
+++ b/thrust/system/detail/sequential/general_copy.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/raw_reference_cast.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -143,5 +142,5 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/get_value.h b/thrust/system/detail/sequential/get_value.h
index 5f3f8eb04..90752d867 100644
--- a/thrust/system/detail/sequential/get_value.h
+++ b/thrust/system/detail/sequential/get_value.h
@@ -20,8 +20,7 @@
 #include <thrust/system/detail/sequential/execution_policy.h>
 #include <thrust/detail/raw_pointer_cast.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -42,5 +41,5 @@ __host__ __device__
 } // end sequential
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/insertion_sort.h b/thrust/system/detail/sequential/insertion_sort.h
index f0bb9bc5f..9acccd8e9 100644
--- a/thrust/system/detail/sequential/insertion_sort.h
+++ b/thrust/system/detail/sequential/insertion_sort.h
@@ -22,8 +22,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/system/detail/sequential/copy_backward.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -149,5 +148,5 @@ void insertion_sort_by_key(RandomAccessIterator1 first1,
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/iter_swap.h b/thrust/system/detail/sequential/iter_swap.h
index 1c8fde6e7..7a5c481fc 100644
--- a/thrust/system/detail/sequential/iter_swap.h
+++ b/thrust/system/detail/sequential/iter_swap.h
@@ -21,8 +21,7 @@
 #include <thrust/detail/raw_pointer_cast.h>
 #include <thrust/detail/swap.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -43,5 +42,5 @@ __host__ __device__
 } // end sequential
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/malloc_and_free.h b/thrust/system/detail/sequential/malloc_and_free.h
index a54ddf0a9..b250140e0 100644
--- a/thrust/system/detail/sequential/malloc_and_free.h
+++ b/thrust/system/detail/sequential/malloc_and_free.h
@@ -21,8 +21,7 @@
 #include <cstdlib> // for malloc & free
 #include <thrust/detail/raw_pointer_cast.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -35,11 +34,7 @@ template<typename DerivedPolicy>
 inline __host__ __device__
 void *malloc(execution_policy<DerivedPolicy> &, std::size_t n)
 {
-#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 200)
   return std::malloc(n);
-#else
-  return 0;
-#endif
 } // end mallc()
 
 
@@ -47,14 +42,12 @@ template<typename DerivedPolicy, typename Pointer>
 inline __host__ __device__
 void free(sequential::execution_policy<DerivedPolicy> &, Pointer ptr)
 {
-#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 200)
   std::free(thrust::raw_pointer_cast(ptr));
-#endif
 } // end mallc()
 
 
 } // end sequential
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/merge.h b/thrust/system/detail/sequential/merge.h
index 6cd314dc7..a45e18004 100644
--- a/thrust/system/detail/sequential/merge.h
+++ b/thrust/system/detail/sequential/merge.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -74,7 +73,7 @@ thrust::pair<OutputIterator1,OutputIterator2>
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/sequential/merge.inl>
 
diff --git a/thrust/system/detail/sequential/merge.inl b/thrust/system/detail/sequential/merge.inl
index ae28ba97d..08d7c0b0d 100644
--- a/thrust/system/detail/sequential/merge.inl
+++ b/thrust/system/detail/sequential/merge.inl
@@ -14,14 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/sequential/merge.h>
 #include <thrust/detail/copy.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/function.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -149,5 +150,5 @@ thrust::pair<OutputIterator1,OutputIterator2>
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/partition.h b/thrust/system/detail/sequential/partition.h
index 66996d637..43d5b0e23 100644
--- a/thrust/system/detail/sequential/partition.h
+++ b/thrust/system/detail/sequential/partition.h
@@ -27,8 +27,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -95,7 +94,8 @@ __host__ __device__
   {
     if(wrapped_pred(*next))
     {
-      iter_swap(first, next);
+      // Fully qualify name to disambiguate overloads found via ADL.
+      THRUST_NS_QUALIFIER::system::detail::sequential::iter_swap(first, next);
       ++first;
     }
   }
@@ -143,7 +143,8 @@ __host__ __device__
   {
     if(wrapped_pred(*stencil_first))
     {
-      iter_swap(first, next);
+      // Fully qualify name to disambiguate overloads found via ADL.
+      THRUST_NS_QUALIFIER::system::detail::sequential::iter_swap(first, next);
       ++first;
     }
 
@@ -335,5 +336,5 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/reduce.h b/thrust/system/detail/sequential/reduce.h
index 55e92acb9..a532f71b2 100644
--- a/thrust/system/detail/sequential/reduce.h
+++ b/thrust/system/detail/sequential/reduce.h
@@ -25,8 +25,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -69,5 +68,5 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/reduce_by_key.h b/thrust/system/detail/sequential/reduce_by_key.h
index f19e62a29..ef17ac5b0 100644
--- a/thrust/system/detail/sequential/reduce_by_key.h
+++ b/thrust/system/detail/sequential/reduce_by_key.h
@@ -19,11 +19,9 @@
 #include <thrust/detail/config.h>
 #include <thrust/pair.h>
 #include <thrust/iterator/iterator_traits.h>
-#include <thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -54,11 +52,8 @@ __host__ __device__
   typedef typename thrust::iterator_traits<InputIterator1>::value_type  InputKeyType;
   typedef typename thrust::iterator_traits<InputIterator2>::value_type  InputValueType;
 
-  typedef typename thrust::detail::intermediate_type_from_function_and_iterators<
-    InputIterator2,
-    OutputIterator2,
-    BinaryFunction
-  >::type TemporaryType;
+  // Use the input iterator's value type per https://wg21.link/P0571
+  using TemporaryType = typename thrust::iterator_value<InputIterator2>::type;
 
   if(keys_first != keys_last)
   {
@@ -103,5 +98,5 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/remove.h b/thrust/system/detail/sequential/remove.h
index 48de522df..df564f15b 100644
--- a/thrust/system/detail/sequential/remove.h
+++ b/thrust/system/detail/sequential/remove.h
@@ -25,8 +25,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -198,5 +197,5 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/scan.h b/thrust/system/detail/sequential/scan.h
index 3ac06a9eb..c5fce2475 100644
--- a/thrust/system/detail/sequential/scan.h
+++ b/thrust/system/detail/sequential/scan.h
@@ -29,8 +29,7 @@
 #include <thrust/detail/type_traits/iterator/is_output_iterator.h>
 #include <thrust/detail/function.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -51,29 +50,10 @@ __host__ __device__
                                 OutputIterator result,
                                 BinaryFunction binary_op)
 {
-  // the pseudocode for deducing the type of the temporary used below:
-  // 
-  // if BinaryFunction is AdaptableBinaryFunction
-  //   TemporaryType = AdaptableBinaryFunction::result_type
-  // else if OutputIterator is a "pure" output iterator
-  //   TemporaryType = InputIterator::value_type
-  // else
-  //   TemporaryType = OutputIterator::value_type
-  //
-  // XXX upon c++0x, TemporaryType needs to be:
-  // result_of_adaptable_function<BinaryFunction>::type
-  
   using namespace thrust::detail;
 
-  typedef typename eval_if<
-    has_result_type<BinaryFunction>::value,
-    result_type<BinaryFunction>,
-    eval_if<
-      is_output_iterator<OutputIterator>::value,
-      thrust::iterator_value<InputIterator>,
-      thrust::iterator_value<OutputIterator>
-    >
-  >::type ValueType;
+  // Use the input iterator's value type per https://wg21.link/P0571
+  using ValueType = typename thrust::iterator_value<InputIterator>::type;
 
   // wrap binary_op
   thrust::detail::wrapped_function<
@@ -99,39 +79,20 @@ __thrust_exec_check_disable__
 template<typename DerivedPolicy,
          typename InputIterator,
          typename OutputIterator,
-         typename T,
+         typename InitialValueType,
          typename BinaryFunction>
 __host__ __device__
   OutputIterator exclusive_scan(sequential::execution_policy<DerivedPolicy> &,
                                 InputIterator first,
                                 InputIterator last,
                                 OutputIterator result,
-                                T init,
+                                InitialValueType init,
                                 BinaryFunction binary_op)
 {
-  // the pseudocode for deducing the type of the temporary used below:
-  // 
-  // if BinaryFunction is AdaptableBinaryFunction
-  //   TemporaryType = AdaptableBinaryFunction::result_type
-  // else if OutputIterator is a "pure" output iterator
-  //   TemporaryType = InputIterator::value_type
-  // else
-  //   TemporaryType = OutputIterator::value_type
-  //
-  // XXX upon c++0x, TemporaryType needs to be:
-  // result_of_adaptable_function<BinaryFunction>::type
-
   using namespace thrust::detail;
 
-  typedef typename eval_if<
-    has_result_type<BinaryFunction>::value,
-    result_type<BinaryFunction>,
-    eval_if<
-      is_output_iterator<OutputIterator>::value,
-      thrust::iterator_value<InputIterator>,
-      thrust::iterator_value<OutputIterator>
-    >
-  >::type ValueType;
+  // Use the initial value type per https://wg21.link/P0571
+  using ValueType = InitialValueType;
 
   if(first != last)
   {
@@ -156,5 +117,5 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/scan_by_key.h b/thrust/system/detail/sequential/scan_by_key.h
index 1e0471b37..c428c1050 100644
--- a/thrust/system/detail/sequential/scan_by_key.h
+++ b/thrust/system/detail/sequential/scan_by_key.h
@@ -26,8 +26,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -52,8 +51,8 @@ __host__ __device__
                                        BinaryPredicate binary_pred,
                                        BinaryFunction binary_op)
 {
-  typedef typename thrust::iterator_traits<InputIterator1>::value_type KeyType;
-  typedef typename thrust::iterator_traits<OutputIterator>::value_type ValueType;
+  using KeyType = typename thrust::iterator_traits<InputIterator1>::value_type;
+  using ValueType = typename thrust::iterator_traits<InputIterator2>::value_type;
 
   // wrap binary_op
   thrust::detail::wrapped_function<
@@ -105,8 +104,8 @@ __host__ __device__
                                        BinaryPredicate binary_pred,
                                        BinaryFunction binary_op)
 {
-  typedef typename thrust::iterator_traits<InputIterator1>::value_type KeyType;
-  typedef typename thrust::iterator_traits<OutputIterator>::value_type ValueType;
+  using KeyType = typename thrust::iterator_traits<InputIterator1>::value_type;
+  using ValueType = T;
 
   if(first1 != last1)
   {
@@ -146,5 +145,5 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/set_operations.h b/thrust/system/detail/sequential/set_operations.h
index a9b1cc688..678754b45 100644
--- a/thrust/system/detail/sequential/set_operations.h
+++ b/thrust/system/detail/sequential/set_operations.h
@@ -26,8 +26,7 @@
 #include <thrust/detail/copy.h>
 #include <thrust/detail/function.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -220,5 +219,5 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/sort.h b/thrust/system/detail/sequential/sort.h
index 0900743d8..34cc7a8ba 100644
--- a/thrust/system/detail/sequential/sort.h
+++ b/thrust/system/detail/sequential/sort.h
@@ -23,8 +23,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -58,7 +57,7 @@ void stable_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/sequential/sort.inl>
 
diff --git a/thrust/system/detail/sequential/sort.inl b/thrust/system/detail/sequential/sort.inl
index bbc18a0b2..241a860af 100644
--- a/thrust/system/detail/sequential/sort.inl
+++ b/thrust/system/detail/sequential/sort.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,6 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
 
 #include <thrust/reverse.h>
 #include <thrust/detail/type_traits.h>
@@ -21,8 +24,9 @@
 #include <thrust/system/detail/sequential/stable_merge_sort.h>
 #include <thrust/system/detail/sequential/stable_primitive_sort.h>
 
-namespace thrust
-{
+#include <nv/target>
+
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -58,7 +62,7 @@ void stable_sort(sequential::execution_policy<DerivedPolicy> &exec,
                  thrust::detail::true_type)
 {
   thrust::system::detail::sequential::stable_primitive_sort(exec, first, last);
-        
+
   // if comp is greater<T> then reverse the keys
   typedef typename thrust::iterator_traits<RandomAccessIterator>::value_type KeyType;
 
@@ -162,14 +166,14 @@ void stable_sort(sequential::execution_policy<DerivedPolicy> &exec,
 {
 
   // the compilation time of stable_primitive_sort is too expensive to use within a single CUDA thread
-#ifndef __CUDA_ARCH__
-  typedef typename thrust::iterator_traits<RandomAccessIterator>::value_type KeyType;
-  sort_detail::use_primitive_sort<KeyType,StrictWeakOrdering> use_primitive_sort;
-#else
-  thrust::detail::false_type use_primitive_sort;
-#endif
-
-  sort_detail::stable_sort(exec, first, last, comp, use_primitive_sort);
+  NV_IF_TARGET(NV_IS_HOST, (
+    using KeyType = thrust::iterator_value_t<RandomAccessIterator>;
+    sort_detail::use_primitive_sort<KeyType, StrictWeakOrdering> use_primitive_sort;
+    sort_detail::stable_sort(exec, first, last, comp, use_primitive_sort);
+  ), ( // NV_IS_DEVICE:
+    thrust::detail::false_type use_primitive_sort;
+    sort_detail::stable_sort(exec, first, last, comp, use_primitive_sort);
+  ));
 }
 
 
@@ -186,19 +190,19 @@ void stable_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
 {
 
   // the compilation time of stable_primitive_sort_by_key is too expensive to use within a single CUDA thread
-#ifndef __CUDA_ARCH__
-  typedef typename thrust::iterator_traits<RandomAccessIterator1>::value_type KeyType;
-  sort_detail::use_primitive_sort<KeyType,StrictWeakOrdering> use_primitive_sort;
-#else
-  thrust::detail::false_type use_primitive_sort;
-#endif
-
-  sort_detail::stable_sort_by_key(exec, first1, last1, first2, comp, use_primitive_sort);
+  NV_IF_TARGET(NV_IS_HOST, (
+    using KeyType = thrust::iterator_value_t<RandomAccessIterator1>;
+    sort_detail::use_primitive_sort<KeyType, StrictWeakOrdering> use_primitive_sort;
+    sort_detail::stable_sort_by_key(exec, first1, last1, first2, comp, use_primitive_sort);
+  ), ( // NV_IS_DEVICE:
+    thrust::detail::false_type use_primitive_sort;
+    sort_detail::stable_sort_by_key(exec, first1, last1, first2, comp, use_primitive_sort);
+  ));
 }
 
 
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/stable_merge_sort.h b/thrust/system/detail/sequential/stable_merge_sort.h
index 359ba8d7b..64aa2bf96 100644
--- a/thrust/system/detail/sequential/stable_merge_sort.h
+++ b/thrust/system/detail/sequential/stable_merge_sort.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -54,7 +53,7 @@ void stable_merge_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/sequential/stable_merge_sort.inl>
 
diff --git a/thrust/system/detail/sequential/stable_merge_sort.inl b/thrust/system/detail/sequential/stable_merge_sort.inl
index 8ba3bf908..02f384afb 100644
--- a/thrust/system/detail/sequential/stable_merge_sort.inl
+++ b/thrust/system/detail/sequential/stable_merge_sort.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,6 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
 
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/temporary_array.h>
@@ -21,8 +24,9 @@
 #include <thrust/system/detail/sequential/insertion_sort.h>
 #include <thrust/detail/minmax.h>
 
-namespace thrust
-{
+#include <nv/target>
+
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -97,7 +101,7 @@ void insertion_sort_each(RandomAccessIterator first,
   {
     for(; first < last; first += partition_size)
     {
-      RandomAccessIterator partition_last = thrust::min(last, first + partition_size);
+      RandomAccessIterator partition_last = (thrust::min)(last, first + partition_size);
 
       thrust::system::detail::sequential::insertion_sort(first, partition_last, comp);
     } // end for
@@ -120,7 +124,7 @@ void insertion_sort_each_by_key(RandomAccessIterator1 keys_first,
   {
     for(; keys_first < keys_last; keys_first += partition_size, values_first += partition_size)
     {
-      RandomAccessIterator1 keys_partition_last = thrust::min(keys_last, keys_first + partition_size);
+      RandomAccessIterator1 keys_partition_last = (thrust::min)(keys_last, keys_first + partition_size);
 
       thrust::system::detail::sequential::insertion_sort_by_key(keys_first, keys_partition_last, values_first, comp);
     } // end for
@@ -143,8 +147,8 @@ void merge_adjacent_partitions(sequential::execution_policy<DerivedPolicy> &exec
 {
   for(; first < last; first += 2 * partition_size, result += 2 * partition_size)
   {
-    RandomAccessIterator1 interval_middle = thrust::min(last, first + partition_size);
-    RandomAccessIterator1 interval_last   = thrust::min(last, interval_middle + partition_size);
+    RandomAccessIterator1 interval_middle = (thrust::min)(last, first + partition_size);
+    RandomAccessIterator1 interval_last   = (thrust::min)(last, interval_middle + partition_size);
 
     thrust::merge(exec,
                   first, interval_middle,
@@ -178,8 +182,8 @@ void merge_adjacent_partitions_by_key(sequential::execution_policy<DerivedPolicy
       keys_first < keys_last;
       keys_first += stride, values_first += stride, keys_result += stride, values_result += stride)
   {
-    RandomAccessIterator1 keys_interval_middle = thrust::min(keys_last, keys_first + partition_size);
-    RandomAccessIterator1 keys_interval_last   = thrust::min(keys_last, keys_interval_middle + partition_size);
+    RandomAccessIterator1 keys_interval_middle = (thrust::min)(keys_last, keys_first + partition_size);
+    RandomAccessIterator1 keys_interval_last   = (thrust::min)(keys_last, keys_interval_middle + partition_size);
 
     RandomAccessIterator2 values_first2 = values_first + (keys_interval_middle - keys_first);
 
@@ -353,12 +357,12 @@ void stable_merge_sort(sequential::execution_policy<DerivedPolicy> &exec,
                        RandomAccessIterator last,
                        StrictWeakOrdering comp)
 {
-  // avoid recursion in CUDA threads
-#ifdef __CUDA_ARCH__
-  stable_merge_sort_detail::iterative_stable_merge_sort(exec, first, last, comp);
-#else
-  stable_merge_sort_detail::recursive_stable_merge_sort(exec, first, last, comp);
-#endif
+  NV_IF_TARGET(NV_IS_DEVICE, (
+    // avoid recursion in CUDA threads
+    stable_merge_sort_detail::iterative_stable_merge_sort(exec, first, last, comp);
+  ), (
+    stable_merge_sort_detail::recursive_stable_merge_sort(exec, first, last, comp);
+  ));
 }
 
 
@@ -373,17 +377,17 @@ void stable_merge_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
                               RandomAccessIterator2 first2,
                               StrictWeakOrdering comp)
 {
-  // avoid recursion in CUDA threads
-#ifdef __CUDA_ARCH__
-  stable_merge_sort_detail::iterative_stable_merge_sort_by_key(exec, first1, last1, first2, comp);
-#else
-  stable_merge_sort_detail::recursive_stable_merge_sort_by_key(exec, first1, last1, first2, comp);
-#endif
+  NV_IF_TARGET(NV_IS_DEVICE, (
+    // avoid recursion in CUDA threads
+    stable_merge_sort_detail::iterative_stable_merge_sort_by_key(exec, first1, last1, first2, comp);
+  ), (
+    stable_merge_sort_detail::recursive_stable_merge_sort_by_key(exec, first1, last1, first2, comp);
+  ));
 }
 
 
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/stable_primitive_sort.h b/thrust/system/detail/sequential/stable_primitive_sort.h
index 3426f953a..acbb81217 100644
--- a/thrust/system/detail/sequential/stable_primitive_sort.h
+++ b/thrust/system/detail/sequential/stable_primitive_sort.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -50,7 +49,7 @@ void stable_primitive_sort_by_key(sequential::execution_policy<DerivedPolicy> &e
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/sequential/stable_primitive_sort.inl>
 
diff --git a/thrust/system/detail/sequential/stable_primitive_sort.inl b/thrust/system/detail/sequential/stable_primitive_sort.inl
index e5cea4ad3..9897d6798 100644
--- a/thrust/system/detail/sequential/stable_primitive_sort.inl
+++ b/thrust/system/detail/sequential/stable_primitive_sort.inl
@@ -24,8 +24,7 @@
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -157,5 +156,5 @@ void stable_primitive_sort_by_key(sequential::execution_policy<DerivedPolicy> &e
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/stable_radix_sort.h b/thrust/system/detail/sequential/stable_radix_sort.h
index 9f7482ccf..1e9713a2c 100644
--- a/thrust/system/detail/sequential/stable_radix_sort.h
+++ b/thrust/system/detail/sequential/stable_radix_sort.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -50,7 +49,7 @@ void stable_radix_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/sequential/stable_radix_sort.inl>
 
diff --git a/thrust/system/detail/sequential/stable_radix_sort.inl b/thrust/system/detail/sequential/stable_radix_sort.inl
index 77202bda4..83d95ebfd 100644
--- a/thrust/system/detail/sequential/stable_radix_sort.inl
+++ b/thrust/system/detail/sequential/stable_radix_sort.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,8 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-#include <limits>
+#include <thrust/detail/config.h>
 
 #include <thrust/copy.h>
 #include <thrust/functional.h>
@@ -26,8 +27,9 @@
 #include <thrust/detail/cstdint.h>
 #include <thrust/scatter.h>
 
-namespace thrust
-{
+#include <limits>
+
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -51,7 +53,7 @@ struct RadixEncoder<char> : public thrust::unary_function<char, unsigned char>
   {
     if(std::numeric_limits<char>::is_signed)
     {
-      return x ^ static_cast<unsigned char>(1) << (8 * sizeof(unsigned char) - 1);
+      return static_cast<unsigned char>(x) ^ static_cast<unsigned char>(1) << (8 * sizeof(unsigned char) - 1);
     }
     else
     {
@@ -66,7 +68,7 @@ struct RadixEncoder<signed char> : public thrust::unary_function<signed char, un
   __host__ __device__
   unsigned char operator()(signed char x) const
   {
-    return x ^ static_cast<unsigned char>(1) << (8 * sizeof(unsigned char) - 1);
+    return static_cast<unsigned char>(x) ^ static_cast<unsigned char>(1) << (8 * sizeof(unsigned char) - 1);
   }
 };
 
@@ -76,7 +78,7 @@ struct RadixEncoder<short> : public thrust::unary_function<short, unsigned short
   __host__ __device__
   unsigned short operator()(short x) const
   {
-    return x ^ static_cast<unsigned short>(1) << (8 * sizeof(unsigned short) - 1);
+    return static_cast<unsigned short>(x) ^ static_cast<unsigned short>(1) << (8 * sizeof(unsigned short) - 1);
   }
 };
 
@@ -242,9 +244,9 @@ void radix_sort(sequential::execution_policy<DerivedPolicy> &exec,
 
   const unsigned int NumHistograms = (8 * sizeof(EncodedType) + (RadixBits - 1)) / RadixBits;
   const unsigned int HistogramSize =  1 << RadixBits;
-  
+
   const EncodedType BitMask = static_cast<EncodedType>((1 << RadixBits) - 1);
-  
+
   Encoder encode;
 
   // storage for histograms
@@ -252,10 +254,10 @@ void radix_sort(sequential::execution_policy<DerivedPolicy> &exec,
 
   // see which passes can be eliminated
   bool skip_shuffle[NumHistograms] = {false};
-  
+
   // false if most recent data is stored in (keys1,vals1)
   bool flip = false;
-    
+
   // compute histograms
   for(size_t i = 0; i < N; i++)
   {
@@ -263,7 +265,7 @@ void radix_sort(sequential::execution_policy<DerivedPolicy> &exec,
 
     for(unsigned int j = 0; j < NumHistograms; j++)
     {
-      const EncodedType BitShift = RadixBits * j;
+      const auto BitShift = static_cast<EncodedType>(RadixBits * j);
       histograms[j][(x >> BitShift) & BitMask]++;
     }
   }
@@ -286,7 +288,7 @@ void radix_sort(sequential::execution_policy<DerivedPolicy> &exec,
     }
   }
 
-  // shuffle keys and (optionally) values 
+  // shuffle keys and (optionally) values
   for(unsigned int i = 0; i < NumHistograms; i++)
   {
     const EncodedType BitShift = static_cast<EncodedType>(RadixBits * i);
@@ -315,11 +317,11 @@ void radix_sort(sequential::execution_policy<DerivedPolicy> &exec,
           radix_shuffle_n<RadixBits>(exec, keys1, N, keys2, BitShift, histograms[i]);
         }
       }
-        
+
       flip = (flip) ? false : true;
     }
   }
- 
+
   // ensure final values are in (keys1,vals1)
   if(flip)
   {
@@ -560,9 +562,9 @@ void stable_radix_sort(sequential::execution_policy<DerivedPolicy> &exec,
   typedef typename thrust::iterator_value<RandomAccessIterator>::type KeyType;
 
   size_t N = last - first;
-  
+
   thrust::detail::temporary_array<KeyType, DerivedPolicy> temp(exec, N);
-  
+
   radix_sort_detail::radix_sort(exec, first, temp.begin(), N);
 }
 
@@ -580,7 +582,7 @@ void stable_radix_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
   typedef typename thrust::iterator_value<RandomAccessIterator2>::type ValueType;
 
   size_t N = last1 - first1;
-  
+
   thrust::detail::temporary_array<KeyType, DerivedPolicy>   temp1(exec, N);
   thrust::detail::temporary_array<ValueType, DerivedPolicy> temp2(exec, N);
 
@@ -591,5 +593,5 @@ void stable_radix_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/trivial_copy.h b/thrust/system/detail/sequential/trivial_copy.h
index 77bf6dd42..ea55c8fd2 100644
--- a/thrust/system/detail/sequential/trivial_copy.h
+++ b/thrust/system/detail/sequential/trivial_copy.h
@@ -24,8 +24,9 @@
 #include <cstring>
 #include <thrust/system/detail/sequential/general_copy.h>
 
-namespace thrust
-{
+#include <nv/target>
+
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -40,17 +41,21 @@ __host__ __device__
                     std::ptrdiff_t n,
                     T *result)
 {
-#ifndef __CUDA_ARCH__
-  std::memmove(result, first, n * sizeof(T));
-  return result + n;
-#else
-  return thrust::system::detail::sequential::general_copy_n(first, n, result);
-#endif
+  T* return_value = NULL;
+
+  NV_IF_TARGET(NV_IS_HOST, (
+    std::memmove(result, first, n * sizeof(T));
+    return_value = result + n;
+  ), ( // NV_IS_DEVICE:
+    return_value = thrust::system::detail::sequential::general_copy_n(first, n, result);
+  ));
+
+  return return_value;
 } // end trivial_copy_n()
 
 
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/unique.h b/thrust/system/detail/sequential/unique.h
index 11168f0b4..c4fe5268a 100644
--- a/thrust/system/detail/sequential/unique.h
+++ b/thrust/system/detail/sequential/unique.h
@@ -26,8 +26,7 @@
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -90,8 +89,42 @@ __host__ __device__
 } // end unique()
 
 
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+__host__ __device__
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(sequential::execution_policy<DerivedPolicy> &,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred)
+{
+  typedef typename thrust::iterator_traits<ForwardIterator>::value_type T;
+  typename thrust::iterator_traits<ForwardIterator>::difference_type count{};
+
+  if(first != last)
+  {
+    count++;
+    T prev = *first;
+
+    for(++first; first != last; ++first)
+    {
+      T temp = *first;
+
+      if (!binary_pred(prev, temp))
+      {
+        count++;
+        prev = temp;
+      }
+    }
+  }
+
+  return count;
+} // end unique()
+
+
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/unique_by_key.h b/thrust/system/detail/sequential/unique_by_key.h
index 899ce02db..d30cc7c71 100644
--- a/thrust/system/detail/sequential/unique_by_key.h
+++ b/thrust/system/detail/sequential/unique_by_key.h
@@ -26,8 +26,7 @@
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -112,5 +111,5 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/system_error.inl b/thrust/system/detail/system_error.inl
index 3e59458aa..075fe88e4 100644
--- a/thrust/system/detail/system_error.inl
+++ b/thrust/system/detail/system_error.inl
@@ -17,10 +17,11 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/system/system_error.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -75,14 +76,14 @@ system_error
 
 
 const error_code &system_error
-  ::code(void) const throw()
+  ::code(void) const noexcept
 {
   return m_error_code;
 } // end system_error::code()
 
 
 const char *system_error
-  ::what(void) const throw()
+  ::what(void) const noexcept
 {
   if(m_what.empty())
   {
@@ -107,5 +108,5 @@ const char *system_error
 
 } // end system
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/error_code.h b/thrust/system/error_code.h
index faa81bbca..d460a315b 100644
--- a/thrust/system/error_code.h
+++ b/thrust/system/error_code.h
@@ -27,8 +27,7 @@
 #include <thrust/system/detail/errno.h>
 #include <iostream>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -515,7 +514,7 @@ namespace errc = system::errc;
 using system::generic_category;
 using system::system_category;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/error_category.inl>
 #include <thrust/system/detail/error_code.inl>
diff --git a/thrust/system/omp/detail/adjacent_difference.h b/thrust/system/omp/detail/adjacent_difference.h
index 7f314eaeb..622ee61ba 100644
--- a/thrust/system/omp/detail/adjacent_difference.h
+++ b/thrust/system/omp/detail/adjacent_difference.h
@@ -20,8 +20,7 @@
 #include <thrust/system/omp/detail/execution_policy.h>
 #include <thrust/system/detail/generic/adjacent_difference.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -46,5 +45,5 @@ template<typename DerivedPolicy,
 } // end detail
 } // end omp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/binary_search.h b/thrust/system/omp/detail/binary_search.h
index 37ff8fab5..1ed700bd8 100644
--- a/thrust/system/omp/detail/binary_search.h
+++ b/thrust/system/omp/detail/binary_search.h
@@ -20,8 +20,7 @@
 #include <thrust/system/omp/detail/execution_policy.h>
 #include <thrust/system/detail/generic/binary_search.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -69,5 +68,5 @@ bool binary_search(execution_policy<DerivedPolicy> &exec,
 } // end detail
 } // end omp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/copy.h b/thrust/system/omp/detail/copy.h
index e2b6661e8..ae7b1eed7 100644
--- a/thrust/system/omp/detail/copy.h
+++ b/thrust/system/omp/detail/copy.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -51,7 +50,7 @@ OutputIterator copy_n(execution_policy<DerivedPolicy> &exec,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/copy.inl>
 
diff --git a/thrust/system/omp/detail/copy.inl b/thrust/system/omp/detail/copy.inl
index 4d104e5ec..47f606dda 100644
--- a/thrust/system/omp/detail/copy.inl
+++ b/thrust/system/omp/detail/copy.inl
@@ -23,8 +23,7 @@
 #include <thrust/detail/type_traits/minimum_type.h>
 
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -133,5 +132,5 @@ OutputIterator copy_n(execution_policy<DerivedPolicy> &exec,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/copy_if.h b/thrust/system/omp/detail/copy_if.h
index a5c28704d..b33fd96df 100644
--- a/thrust/system/omp/detail/copy_if.h
+++ b/thrust/system/omp/detail/copy_if.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -45,7 +44,7 @@ template<typename DerivedPolicy,
 } // end detail
 } // end omp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/copy_if.inl>
 
diff --git a/thrust/system/omp/detail/copy_if.inl b/thrust/system/omp/detail/copy_if.inl
index 7f2516a74..8e597d4fc 100644
--- a/thrust/system/omp/detail/copy_if.inl
+++ b/thrust/system/omp/detail/copy_if.inl
@@ -20,8 +20,7 @@
 #include <thrust/system/omp/detail/copy_if.h>
 #include <thrust/system/detail/generic/copy_if.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -50,5 +49,5 @@ template<typename DerivedPolicy,
 } // end detail
 } // end omp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/default_decomposition.h b/thrust/system/omp/detail/default_decomposition.h
index cb4b03c71..2fe0a24fd 100644
--- a/thrust/system/omp/detail/default_decomposition.h
+++ b/thrust/system/omp/detail/default_decomposition.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/internal/decompose.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -39,7 +38,7 @@ thrust::system::detail::internal::uniform_decomposition<IndexType> default_decom
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/default_decomposition.inl>
 
diff --git a/thrust/system/omp/detail/default_decomposition.inl b/thrust/system/omp/detail/default_decomposition.inl
index 53f4b428f..0698d53fb 100644
--- a/thrust/system/omp/detail/default_decomposition.inl
+++ b/thrust/system/omp/detail/default_decomposition.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/default_decomposition.h>
 
@@ -22,8 +24,7 @@
 #include <omp.h>
 #endif // omp support
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -56,5 +57,5 @@ thrust::system::detail::internal::uniform_decomposition<IndexType> default_decom
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/execution_policy.h b/thrust/system/omp/detail/execution_policy.h
index 52c879a16..f9b45312b 100644
--- a/thrust/system/omp/detail/execution_policy.h
+++ b/thrust/system/omp/detail/execution_policy.h
@@ -22,8 +22,7 @@
 #include <thrust/iterator/detail/any_system_tag.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 // put the canonical tag in the same ns as the backend's entry points
@@ -103,5 +102,5 @@ using thrust::system::omp::execution_policy;
 using thrust::system::omp::tag;
 
 } // end omp
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/extrema.h b/thrust/system/omp/detail/extrema.h
index 96661180d..bde4e5f80 100644
--- a/thrust/system/omp/detail/extrema.h
+++ b/thrust/system/omp/detail/extrema.h
@@ -20,8 +20,7 @@
 #include <thrust/system/omp/detail/execution_policy.h>
 #include <thrust/system/detail/generic/extrema.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -62,6 +61,6 @@ thrust::pair<ForwardIterator,ForwardIterator> minmax_element(execution_policy<De
 } // end detail
 } // end omp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
 
diff --git a/thrust/system/omp/detail/find.h b/thrust/system/omp/detail/find.h
index e6445c068..d2abac95e 100644
--- a/thrust/system/omp/detail/find.h
+++ b/thrust/system/omp/detail/find.h
@@ -25,8 +25,7 @@
 #include <thrust/system/detail/generic/find.h>
 #include <thrust/system/omp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -47,5 +46,5 @@ InputIterator find_if(execution_policy<DerivedPolicy> &exec,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/for_each.h b/thrust/system/omp/detail/for_each.h
index 4e6955ea2..a2030f374 100644
--- a/thrust/system/omp/detail/for_each.h
+++ b/thrust/system/omp/detail/for_each.h
@@ -25,8 +25,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -54,7 +53,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/for_each.inl>
 
diff --git a/thrust/system/omp/detail/for_each.inl b/thrust/system/omp/detail/for_each.inl
index 6be6435e6..4246d5380 100644
--- a/thrust/system/omp/detail/for_each.inl
+++ b/thrust/system/omp/detail/for_each.inl
@@ -14,21 +14,17 @@
  *  limitations under the License.
  */
 
-
-/*! \file for_each.inl
- *  \brief Inline file for for_each.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/static_assert.h>
-#include <thrust/distance.h>
 #include <thrust/detail/function.h>
-#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/static_assert.h>
 #include <thrust/distance.h>
 #include <thrust/for_each.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/omp/detail/pragma_omp.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -62,14 +58,11 @@ RandomAccessIterator for_each_n(execution_policy<DerivedPolicy> &,
   // create a wrapped function for f
   thrust::detail::wrapped_function<UnaryFunction,void> wrapped_f(f);
 
-// do not attempt to compile the body of this function, which depends on #pragma omp,
-// without support from the compiler
-// XXX implement the body of this function in another file to eliminate this ugliness
-#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
   // use a signed type for the iteration variable or suffer the consequences of warnings
   typedef typename thrust::iterator_difference<RandomAccessIterator>::type DifferenceType;
   DifferenceType signed_n = n;
-#pragma omp parallel for
+
+  THRUST_PRAGMA_OMP(parallel for)
   for(DifferenceType i = 0;
       i < signed_n;
       ++i)
@@ -77,10 +70,9 @@ RandomAccessIterator for_each_n(execution_policy<DerivedPolicy> &,
     RandomAccessIterator temp = first + i;
     wrapped_f(*temp);
   }
-#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE
 
   return first + n;
-} // end for_each_n() 
+} // end for_each_n()
 
 template<typename DerivedPolicy,
          typename RandomAccessIterator,
@@ -96,5 +88,5 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/memory.inl b/thrust/system/omp/detail/memory.inl
index 331ba5cab..db9b4f07b 100644
--- a/thrust/system/omp/detail/memory.inl
+++ b/thrust/system/omp/detail/memory.inl
@@ -14,14 +14,16 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/cpp/detail/execution_policy.h>
 #include <thrust/system/omp/memory.h>
 #include <thrust/system/cpp/memory.h>
+
 #include <limits>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -80,5 +82,5 @@ inline void free(pointer<void> ptr)
 
 } // end omp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/par.h b/thrust/system/omp/detail/par.h
index 74c948696..b81a5d489 100644
--- a/thrust/system/omp/detail/par.h
+++ b/thrust/system/omp/detail/par.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/allocator_aware_execution_policy.h>
 #include <thrust/system/omp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -35,7 +34,7 @@ struct par_t : thrust::system::omp::detail::execution_policy<par_t>,
     thrust::system::omp::detail::execution_policy>
 {
   __host__ __device__
-  par_t() : thrust::system::omp::detail::execution_policy<par_t>() {}
+  constexpr par_t() : thrust::system::omp::detail::execution_policy<par_t>() {}
 };
 
 
@@ -58,5 +57,5 @@ using thrust::system::omp::par;
 
 
 } // end omp
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/partition.h b/thrust/system/omp/detail/partition.h
index 64a76e278..7a6f4a934 100644
--- a/thrust/system/omp/detail/partition.h
+++ b/thrust/system/omp/detail/partition.h
@@ -25,8 +25,7 @@
 #include <thrust/system/omp/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -85,7 +84,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/partition.inl>
 
diff --git a/thrust/system/omp/detail/partition.inl b/thrust/system/omp/detail/partition.inl
index b81c17cbf..ba0a09eaf 100644
--- a/thrust/system/omp/detail/partition.inl
+++ b/thrust/system/omp/detail/partition.inl
@@ -25,8 +25,7 @@
 #include <thrust/system/omp/detail/partition.h>
 #include <thrust/system/detail/generic/partition.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -104,5 +103,5 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/pointer.inl b/thrust/system/omp/detail/pointer.inl
deleted file mode 100644
index 2125302e4..000000000
--- a/thrust/system/omp/detail/pointer.inl
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- *  Copyright 2008-2018 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-namespace thrust
-{
-namespace system
-{
-namespace omp
-{
-
-
-template<typename T>
-  template<typename OtherT>
-    reference<T> &
-      reference<T>
-        ::operator=(const reference<OtherT> &other)
-{
-  return super_t::operator=(other);
-} // end reference::operator=()
-
-template<typename T>
-  reference<T> &
-    reference<T>
-      ::operator=(const value_type &x)
-{
-  return super_t::operator=(x);
-} // end reference::operator=()
-
-template<typename T>
-__host__ __device__
-void swap(reference<T> a, reference<T> b)
-{
-  a.swap(b);
-} // end swap()
-
-} // end omp
-} // end system
-} // end thrust
-
diff --git a/thrust/system/omp/detail/pragma_omp.h b/thrust/system/omp/detail/pragma_omp.h
new file mode 100644
index 000000000..a8eeae234
--- /dev/null
+++ b/thrust/system/omp/detail/pragma_omp.h
@@ -0,0 +1,56 @@
+/******************************************************************************
+* Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+*     * Redistributions of source code must retain the above copyright
+*       notice, this list of conditions and the following disclaimer.
+*     * Redistributions in binary form must reproduce the above copyright
+*       notice, this list of conditions and the following disclaimer in the
+*       documentation and/or other materials provided with the distribution.
+*     * Neither the name of the NVIDIA CORPORATION nor the
+*       names of its contributors may be used to endorse or promote products
+*       derived from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+******************************************************************************/
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+// MSVC ICEs when using the standard C++11 `_Pragma` operator with OpenMP
+// directives.
+// WAR this by using the MSVC-extension `__pragma`. See this link for more info:
+// https://developercommunity.visualstudio.com/t/Using-C11s-_Pragma-with-OpenMP-dire/1590628
+#define THRUST_PRAGMA_OMP_IMPL(directive) __pragma(directive)
+#else // Not MSVC:
+#define THRUST_PRAGMA_OMP_IMPL(directive) _Pragma(#directive)
+#endif
+
+// For internal use only -- THRUST_PRAGMA_OMP is used to switch between
+// different flavors of openmp pragmas. Pragmas are not emitted when OpenMP is
+// not available.
+//
+// Usage:
+//   Replace: #pragma omp parallel for
+//   With   : THRUST_PRAGMA_OMP(parallel for)
+//
+#if defined(_NVHPC_STDPAR_OPENMP) && _NVHPC_STDPAR_OPENMP == 1
+#define THRUST_PRAGMA_OMP(directive) THRUST_PRAGMA_OMP_IMPL(omp_stdpar directive)
+#elif defined(_OPENMP)
+#define THRUST_PRAGMA_OMP(directive) THRUST_PRAGMA_OMP_IMPL(omp directive)
+#else
+#define THRUST_PRAGMA_OMP(directive)
+#endif
diff --git a/thrust/system/omp/detail/reduce.h b/thrust/system/omp/detail/reduce.h
index c058e05db..5e5f2106e 100644
--- a/thrust/system/omp/detail/reduce.h
+++ b/thrust/system/omp/detail/reduce.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -48,7 +47,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/reduce.inl>
 
diff --git a/thrust/system/omp/detail/reduce.inl b/thrust/system/omp/detail/reduce.inl
index 4609922a9..6a5723780 100644
--- a/thrust/system/omp/detail/reduce.inl
+++ b/thrust/system/omp/detail/reduce.inl
@@ -14,14 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/system/omp/detail/reduce.h>
 #include <thrust/system/omp/detail/default_decomposition.h>
 #include <thrust/system/omp/detail/reduce_intervals.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -31,7 +32,7 @@ namespace detail
 
 
 template<typename DerivedPolicy,
-         typename InputIterator, 
+         typename InputIterator,
          typename OutputType,
          typename BinaryFunction>
   OutputType reduce(execution_policy<DerivedPolicy> &exec,
@@ -51,10 +52,10 @@ template<typename DerivedPolicy,
   // allocate storage for the initializer and partial sums
   // XXX use select_system for Tag
   thrust::detail::temporary_array<OutputType,DerivedPolicy> partial_sums(exec, decomp1.size() + 1);
-  
+
   // set first element of temp array to init
   partial_sums[0] = init;
-  
+
   // accumulate partial sums (first level reduction)
   thrust::system::omp::detail::reduce_intervals(exec, first, partial_sums.begin() + 1, binary_op, decomp1);
 
@@ -68,5 +69,5 @@ template<typename DerivedPolicy,
 } // end detail
 } // end omp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/reduce_by_key.h b/thrust/system/omp/detail/reduce_by_key.h
index 37e89ecba..005616de5 100644
--- a/thrust/system/omp/detail/reduce_by_key.h
+++ b/thrust/system/omp/detail/reduce_by_key.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -55,7 +54,7 @@ template <typename DerivedPolicy,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/reduce_by_key.inl>
 
diff --git a/thrust/system/omp/detail/reduce_by_key.inl b/thrust/system/omp/detail/reduce_by_key.inl
index afd4c8e51..4088d0634 100644
--- a/thrust/system/omp/detail/reduce_by_key.inl
+++ b/thrust/system/omp/detail/reduce_by_key.inl
@@ -14,13 +14,14 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/reduce_by_key.h>
 #include <thrust/system/detail/generic/reduce_by_key.h>
 #include <thrust/distance.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -37,7 +38,7 @@ template <typename DerivedPolicy,
           typename BinaryFunction>
   thrust::pair<OutputIterator1,OutputIterator2>
     reduce_by_key(execution_policy<DerivedPolicy> &exec,
-                  InputIterator1 keys_first, 
+                  InputIterator1 keys_first,
                   InputIterator1 keys_last,
                   InputIterator2 values_first,
                   OutputIterator1 keys_output,
@@ -53,5 +54,5 @@ template <typename DerivedPolicy,
 } // end detail
 } // end omp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/reduce_intervals.h b/thrust/system/omp/detail/reduce_intervals.h
index 44551e645..1c69fc621 100644
--- a/thrust/system/omp/detail/reduce_intervals.h
+++ b/thrust/system/omp/detail/reduce_intervals.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -47,7 +46,7 @@ void reduce_intervals(execution_policy<DerivedPolicy> &exec,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/reduce_intervals.inl>
 
diff --git a/thrust/system/omp/detail/reduce_intervals.inl b/thrust/system/omp/detail/reduce_intervals.inl
index 961f2757a..2668a7b60 100644
--- a/thrust/system/omp/detail/reduce_intervals.inl
+++ b/thrust/system/omp/detail/reduce_intervals.inl
@@ -14,6 +14,7 @@
  *  limitations under the License.
  */
 
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/reduce_intervals.h>
@@ -21,8 +22,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/detail/cstdint.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -63,9 +63,7 @@ void reduce_intervals(execution_policy<DerivedPolicy> &,
 
   index_type n = static_cast<index_type>(decomp.size());
 
-#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
-# pragma omp parallel for
-#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE
+  THRUST_PRAGMA_OMP(parallel for)
   for(index_type i = 0; i < n; i++)
   {
     InputIterator begin = input + decomp[i].begin();
@@ -93,5 +91,5 @@ void reduce_intervals(execution_policy<DerivedPolicy> &,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/remove.h b/thrust/system/omp/detail/remove.h
index ca4eab845..9b2d46e75 100644
--- a/thrust/system/omp/detail/remove.h
+++ b/thrust/system/omp/detail/remove.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -75,7 +74,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/remove.inl>
 
diff --git a/thrust/system/omp/detail/remove.inl b/thrust/system/omp/detail/remove.inl
index aa8289476..5330f1407 100644
--- a/thrust/system/omp/detail/remove.inl
+++ b/thrust/system/omp/detail/remove.inl
@@ -20,8 +20,7 @@
 #include <thrust/system/omp/detail/remove.h>
 #include <thrust/system/detail/generic/remove.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -90,5 +89,5 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/sort.h b/thrust/system/omp/detail/sort.h
index 339ce5b6e..cf0b8c6d6 100644
--- a/thrust/system/omp/detail/sort.h
+++ b/thrust/system/omp/detail/sort.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -49,7 +48,7 @@ void stable_sort_by_key(execution_policy<DerivedPolicy> &exec,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/sort.inl>
 
diff --git a/thrust/system/omp/detail/sort.inl b/thrust/system/omp/detail/sort.inl
index 587017ca6..a0867ca4d 100644
--- a/thrust/system/omp/detail/sort.inl
+++ b/thrust/system/omp/detail/sort.inl
@@ -14,6 +14,7 @@
  *  limitations under the License.
  */
 
+#pragma once
 
 #include <thrust/detail/config.h>
 
@@ -30,8 +31,7 @@
 #include <thrust/detail/seq.h>
 #include <thrust/detail/temporary_array.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -114,13 +114,14 @@ void stable_sort(execution_policy<DerivedPolicy> &exec,
   , "OpenMP compiler support is not enabled"
   );
 
+  // Avoid issues on compilers that don't provide `omp_get_num_threads()`.
 #if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
   typedef typename thrust::iterator_difference<RandomAccessIterator>::type IndexType;
-  
+
   if(first == last)
     return;
 
-  #pragma omp parallel
+  THRUST_PRAGMA_OMP(parallel)
   {
     thrust::system::detail::internal::uniform_decomposition<IndexType> decomp(last - first, 1, omp_get_num_threads());
 
@@ -136,7 +137,7 @@ void stable_sort(execution_policy<DerivedPolicy> &exec,
                           comp);
     }
 
-    #pragma omp barrier
+    THRUST_PRAGMA_OMP(barrier)
 
     // XXX For some reason, MSVC 2015 yields an error unless we include this meaningless semicolon here
     ;
@@ -167,7 +168,7 @@ void stable_sort(execution_policy<DerivedPolicy> &exec,
       nseg = (nseg + 1) / 2;
       h *= 2;
 
-      #pragma omp barrier
+      THRUST_PRAGMA_OMP(barrier)
     }
   }
 #endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE
@@ -196,13 +197,14 @@ void stable_sort_by_key(execution_policy<DerivedPolicy> &exec,
   , "OpenMP compiler support is not enabled"
   );
 
+  // Avoid issues on compilers that don't provide `omp_get_num_threads()`.
 #if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
   typedef typename thrust::iterator_difference<RandomAccessIterator1>::type IndexType;
-  
+
   if(keys_first == keys_last)
     return;
 
-  #pragma omp parallel
+  THRUST_PRAGMA_OMP(parallel)
   {
     thrust::system::detail::internal::uniform_decomposition<IndexType> decomp(keys_last - keys_first, 1, omp_get_num_threads());
 
@@ -219,7 +221,7 @@ void stable_sort_by_key(execution_policy<DerivedPolicy> &exec,
                                  comp);
     }
 
-    #pragma omp barrier
+    THRUST_PRAGMA_OMP(barrier)
 
     // XXX For some reason, MSVC 2015 yields an error unless we include this meaningless semicolon here
     ;
@@ -251,7 +253,7 @@ void stable_sort_by_key(execution_policy<DerivedPolicy> &exec,
       nseg = (nseg + 1) / 2;
       h *= 2;
 
-      #pragma omp barrier
+      THRUST_PRAGMA_OMP(barrier)
     }
   }
 #endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE
@@ -261,5 +263,5 @@ void stable_sort_by_key(execution_policy<DerivedPolicy> &exec,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/unique.h b/thrust/system/omp/detail/unique.h
index 433e7689b..cf8025665 100644
--- a/thrust/system/omp/detail/unique.h
+++ b/thrust/system/omp/detail/unique.h
@@ -20,8 +20,7 @@
 #include <thrust/system/omp/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -50,10 +49,20 @@ template<typename DerivedPolicy,
                              BinaryPredicate binary_pred);
 
 
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(execution_policy<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred);
+
+
 } // end namespace detail
 } // end namespace omp 
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/unique.inl>
 
diff --git a/thrust/system/omp/detail/unique.inl b/thrust/system/omp/detail/unique.inl
index 70f026dbb..9a93fb135 100644
--- a/thrust/system/omp/detail/unique.inl
+++ b/thrust/system/omp/detail/unique.inl
@@ -21,8 +21,7 @@
 #include <thrust/system/detail/generic/unique.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -59,8 +58,22 @@ template<typename DerivedPolicy,
 } // end unique_copy()
 
 
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(execution_policy<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred)
+{
+  // omp prefers generic::unique_count to cpp::unique_count
+  return thrust::system::detail::generic::unique_count(exec,first,last,binary_pred);
+} // end unique_count()
+
+
 } // end namespace detail
 } // end namespace omp 
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/unique_by_key.h b/thrust/system/omp/detail/unique_by_key.h
index ff3acb094..43859b64e 100644
--- a/thrust/system/omp/detail/unique_by_key.h
+++ b/thrust/system/omp/detail/unique_by_key.h
@@ -20,8 +20,7 @@
 #include <thrust/system/omp/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -61,7 +60,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace omp 
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/unique_by_key.inl>
 
diff --git a/thrust/system/omp/detail/unique_by_key.inl b/thrust/system/omp/detail/unique_by_key.inl
index 0a4367b7b..6610c8a00 100644
--- a/thrust/system/omp/detail/unique_by_key.inl
+++ b/thrust/system/omp/detail/unique_by_key.inl
@@ -21,8 +21,7 @@
 #include <thrust/system/detail/generic/unique_by_key.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -70,5 +69,5 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/vector.inl b/thrust/system/omp/detail/vector.inl
deleted file mode 100644
index 2dac743cb..000000000
--- a/thrust/system/omp/detail/vector.inl
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/omp/vector.h>
-#include <utility>
-
-namespace thrust
-{
-namespace system
-{
-namespace omp
-{
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector()
-      : super_t()
-{}
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector(size_type n)
-      : super_t(n)
-{}
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector(size_type n, const value_type &value)
-      : super_t(n,value)
-{}
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector(const vector &x)
-      : super_t(x)
-{}
-
-#if __cplusplus >= 201103L
-  template<typename T, typename Allocator>
-    vector<T,Allocator>
-      ::vector(vector &&x)
-        : super_t(std::move(x))
-  {}
-#endif
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator>
-      ::vector(const thrust::detail::vector_base<OtherT,OtherAllocator> &x)
-        : super_t(x)
-{}
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator>
-      ::vector(const std::vector<OtherT,OtherAllocator> &x)
-        : super_t(x)
-{}
-
-template<typename T, typename Allocator>
-  template<typename InputIterator>
-    vector<T,Allocator>
-      ::vector(InputIterator first, InputIterator last)
-        : super_t(first,last)
-{}
-
-template<typename T, typename Allocator>
-  vector<T,Allocator> &
-    vector<T,Allocator>
-      ::operator=(const vector &x)
-{
-  super_t::operator=(x);
-  return *this;
-}
-
-#if __cplusplus >= 201103L
-  template<typename T, typename Allocator>
-    vector<T,Allocator> &
-      vector<T,Allocator>
-        ::operator=(vector &&x)
-  {
-    super_t::operator=(std::move(x));
-    return *this;
-  }
-#endif
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator> &
-      vector<T,Allocator>
-        ::operator=(const std::vector<OtherT,OtherAllocator> &x)
-{
-  super_t::operator=(x);
-  return *this;
-}
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator> &
-      vector<T,Allocator>
-        ::operator=(const thrust::detail::vector_base<OtherT,OtherAllocator> &x)
-{
-  super_t::operator=(x);
-  return *this;
-}
-      
-} // end omp
-} // end system
-} // end thrust
-
diff --git a/thrust/system/omp/execution_policy.h b/thrust/system/omp/execution_policy.h
index 8a413f7f6..c027d6be6 100644
--- a/thrust/system/omp/execution_policy.h
+++ b/thrust/system/omp/execution_policy.h
@@ -76,8 +76,7 @@
 // define these entities here for the purpose of Doxygenating them
 // they are actually defined elsewhere
 #if 0
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -150,7 +149,7 @@ static const unspecified par;
 
 } // end cpp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 #endif
 
 
diff --git a/thrust/system/omp/memory.h b/thrust/system/omp/memory.h
index 959e6c0c1..31f407c4c 100644
--- a/thrust/system/omp/memory.h
+++ b/thrust/system/omp/memory.h
@@ -27,11 +27,8 @@
 #include <thrust/mr/allocator.h>
 #include <ostream>
 
-namespace thrust
-{
-namespace system
-{
-namespace omp
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace omp
 {
 
 /*! Allocates an area of memory available to Thrust's <tt>omp</tt> system.
@@ -67,83 +64,38 @@ inline pointer<T> malloc(std::size_t n);
  */
 inline void free(pointer<void> ptr);
 
-// XXX upon c++11
-// template<typename T>
-// using allocator = thrust::mr::stateless_resource_allocator<T, memory_resource>;
-
-/*! \p omp::allocator is the default allocator used by the \p omp system's containers such as
- *  <tt>omp::vector</tt> if no user-specified allocator is provided. \p omp::allocator allocates
- *  (deallocates) storage with \p omp::malloc (\p omp::free).
+/*! \p omp::allocator is the default allocator used by the \p omp system's
+ *  containers such as <tt>omp::vector</tt> if no user-specified allocator is
+ *  provided. \p omp::allocator allocates (deallocates) storage with \p
+ *  omp::malloc (\p omp::free).
  */
 template<typename T>
-  struct allocator
-    : thrust::mr::stateless_resource_allocator<
-        T,
-        memory_resource
-    >
-{
-private:
-    typedef thrust::mr::stateless_resource_allocator<
-        T,
-        memory_resource
-    > base;
-
-public:
-  /*! The \p rebind metafunction provides the type of an \p allocator
-   *  instantiated with another type.
-   *
-   *  \tparam U The other type to use for instantiation.
-   */
-  template<typename U>
-    struct rebind
-  {
-    /*! The typedef \p other gives the type of the rebound \p allocator.
-     */
-    typedef allocator<U> other;
-  };
-
-  /*! No-argument constructor has no effect.
-   */
-  __host__ __device__
-  inline allocator() {}
+using allocator = thrust::mr::stateless_resource_allocator<
+  T, thrust::system::omp::memory_resource
+>;
 
-  /*! Copy constructor has no effect.
-   */
-  __host__ __device__
-  inline allocator(const allocator & other) : base(other) {}
-
-  /*! Constructor from other \p allocator has no effect.
-   */
-  template<typename U>
-  __host__ __device__
-  inline allocator(const allocator<U> & other) : base(other) {}
-
-  /*! Destructor has no effect.
-   */
-  __host__ __device__
-  inline ~allocator() {}
-}; // end allocator
-
-} // end omp
-
-/*! \}
+/*! \p omp::universal_allocator allocates memory that can be used by the \p omp
+ *  system and host systems.
  */
+template<typename T>
+using universal_allocator = thrust::mr::stateless_resource_allocator<
+  T, thrust::system::omp::universal_memory_resource
+>;
 
-} // end system
+}} // namespace system::omp
 
 /*! \namespace thrust::omp
  *  \brief \p thrust::omp is a top-level alias for thrust::system::omp.
  */
 namespace omp
 {
-
 using thrust::system::omp::malloc;
 using thrust::system::omp::free;
 using thrust::system::omp::allocator;
+using thrust::system::omp::universal_allocator;
+} // namespace omp
 
-} // end omp
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/memory.inl>
 
diff --git a/thrust/system/omp/memory_resource.h b/thrust/system/omp/memory_resource.h
index 6a540d834..d8eed0c0f 100644
--- a/thrust/system/omp/memory_resource.h
+++ b/thrust/system/omp/memory_resource.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2018 NVIDIA Corporation
+ *  Copyright 2018-2020 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
  */
 
 /*! \file omp/memory_resource.h
- *  \brief Memory resources for the OMP system.
+ *  \brief Memory resources for the OpenMP system.
  */
 
 #pragma once
@@ -26,11 +26,8 @@
 
 #include <thrust/system/omp/pointer.h>
 
-namespace thrust
-{
-namespace system
-{
-namespace omp
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace omp
 {
 
 //! \cond
@@ -40,24 +37,33 @@ namespace detail
         thrust::mr::new_delete_resource,
         thrust::omp::pointer<void>
     > native_resource;
-}
+
+    typedef thrust::mr::fancy_pointer_resource<
+        thrust::mr::new_delete_resource,
+        thrust::omp::universal_pointer<void>
+    > universal_native_resource;
+} // namespace detail
 //! \endcond
 
 /*! \addtogroup memory_resources Memory Resources
- *  \ingroup memory_management_classes
+ *  \ingroup memory_management
  *  \{
  */
 
-/*! The memory resource for the OMP system. Uses \p mr::new_delete_resource and tags it with \p omp::pointer. */
+/*! The memory resource for the OpenMP system. Uses \p mr::new_delete_resource
+ *  and tags it with \p omp::pointer.
+ */
 typedef detail::native_resource memory_resource;
-/*! An alias for \p omp::memory_resource. */
-typedef detail::native_resource universal_memory_resource;
-/*! An alias for \p omp::memory_resource. */
+/*! The unified memory resource for the OpenMP system. Uses
+ *  \p mr::new_delete_resource and tags it with \p omp::universal_pointer.
+ */
+typedef detail::universal_native_resource universal_memory_resource;
+/*! An alias for \p omp::universal_memory_resource. */
 typedef detail::native_resource universal_host_pinned_memory_resource;
 
 /*! \}
  */
 
-}
-}
-}
+}} // namespace system::omp
+
+THRUST_NAMESPACE_END
diff --git a/thrust/system/omp/pointer.h b/thrust/system/omp/pointer.h
index 36b6bed12..2be42e4fc 100644
--- a/thrust/system/omp/pointer.h
+++ b/thrust/system/omp/pointer.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2018 NVIDIA Corporation
+ *  Copyright 2008-2020 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -21,113 +21,30 @@
 #pragma once
 
 #include <thrust/detail/config.h>
+#include <type_traits>
 #include <thrust/system/omp/detail/execution_policy.h>
-#include <thrust/detail/type_traits.h>
 #include <thrust/detail/pointer.h>
 #include <thrust/detail/reference.h>
 
-namespace thrust
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace omp
 {
-namespace system
-{
-namespace omp
-{
-
-template<typename> class pointer;
-
-} // end omp
-} // end system
-} // end thrust
-
-
-/*! \cond
- */
-
-// specialize thrust::iterator_traits to avoid problems with the name of
-// pointer's constructor shadowing its nested pointer type
-// do this before pointer is defined so the specialization is correctly
-// used inside the definition
-namespace thrust
-{
-
-template<typename Element>
-  struct iterator_traits<thrust::system::omp::pointer<Element> >
-{
-  private:
-    typedef thrust::system::omp::pointer<Element> ptr;
-
-  public:
-    typedef typename ptr::iterator_category       iterator_category;
-    typedef typename ptr::value_type              value_type;
-    typedef typename ptr::difference_type         difference_type;
-    typedef ptr                                   pointer;
-    typedef typename ptr::reference               reference;
-}; // end iterator_traits
-
-} // end thrust
-
-/*! \endcond
- */
-
-
-namespace thrust
-{
-namespace system
-{
-
-/*! \addtogroup system_backends Systems
- *  \ingroup system
- *  \{
- */
 
-/*! \namespace thrust::system::omp
- *  \brief \p thrust::system::omp is the namespace containing functionality for allocating, manipulating,
- *         and deallocating memory available to Thrust's OpenMP backend system.
- *         The identifiers are provided in a separate namespace underneath <tt>thrust::system</tt>
- *         for import convenience but are also aliased in the top-level <tt>thrust::omp</tt>
- *         namespace for easy access.
+/*! \p omp::pointer stores a pointer to an object allocated in memory accessible
+ *  by the \p omp system. This type provides type safety when dispatching
+ *  algorithms on ranges resident in \p omp memory.
  *
- */
-namespace omp
-{
-
-// forward declaration of reference for pointer
-template<typename Element> class reference;
-
-/*! \cond
- */
-
-// XXX nvcc + msvc have trouble instantiating reference below
-//     this is a workaround
-namespace detail
-{
-
-template<typename Element>
-  struct reference_msvc_workaround
-{
-  typedef thrust::system::omp::reference<Element> type;
-}; // end reference_msvc_workaround
-
-} // end detail
-
-/*! \endcond
- */
-
-
-/*! \p pointer stores a pointer to an object allocated in memory available to the omp system.
- *  This type provides type safety when dispatching standard algorithms on ranges resident
- *  in omp memory.
+ *  \p omp::pointer has pointer semantics: it may be dereferenced and
+ *  manipulated with pointer arithmetic.
  *
- *  \p pointer has pointer semantics: it may be dereferenced and manipulated with pointer arithmetic.
+ *  \p omp::pointer can be created with the function \p omp::malloc, or by
+ *  explicitly calling its constructor with a raw pointer.
  *
- *  \p pointer can be created with the function \p omp::malloc, or by explicitly calling its constructor
- *  with a raw pointer.
+ *  The raw pointer encapsulated by a \p omp::pointer may be obtained by eiter its
+ *  <tt>get</tt> member function or the \p raw_pointer_cast function.
  *
- *  The raw pointer encapsulated by a \p pointer may be obtained by eiter its <tt>get</tt> member function
- *  or the \p raw_pointer_cast function.
- *
- *  \note \p pointer is not a "smart" pointer; it is the programmer's responsibility to deallocate memory
- *  pointed to by \p pointer.
+ *  \note \p omp::pointer is not a "smart" pointer; it is the programmer's
+ *        responsibility to deallocate memory pointed to by \p omp::pointer.
  *
  *  \tparam T specifies the type of the pointee.
  *
@@ -135,226 +52,66 @@ template<typename Element>
  *  \see omp::free
  *  \see raw_pointer_cast
  */
-template<typename T>
-  class pointer
-    : public thrust::pointer<
-               T,
-               thrust::system::omp::tag,
-               thrust::system::omp::reference<T>,
-               thrust::system::omp::pointer<T>
-             >
-{
-  /*! \cond
-   */
-
-  private:
-    typedef thrust::pointer<
-      T,
-      thrust::system::omp::tag,
-      //thrust::system::omp::reference<T>,
-      typename detail::reference_msvc_workaround<T>::type,
-      thrust::system::omp::pointer<T>
-    > super_t;
-
-  /*! \endcond
-   */
-
-  public:
-    // note that omp::pointer's member functions need __host__ __device__
-    // to interoperate with nvcc + iterators' dereference member function
-
-    /*! \p pointer's no-argument constructor initializes its encapsulated pointer to \c 0.
-     */
-    __host__ __device__
-    pointer() : super_t() {}
-
-    #if THRUST_CPP_DIALECT >= 2011
-    // NOTE: This is needed so that Thrust smart pointers can be used in
-    // `std::unique_ptr`.
-    __host__ __device__
-    pointer(decltype(nullptr)) : super_t(nullptr) {}
-    #endif
-
-    /*! This constructor allows construction of a <tt>pointer<const T></tt> from a <tt>T*</tt>.
-     *
-     *  \param ptr A raw pointer to copy from, presumed to point to a location in memory
-     *         accessible by the \p omp system.
-     *  \tparam OtherT \p OtherT shall be convertible to \p T.
-     */
-    template<typename OtherT>
-    __host__ __device__
-    explicit pointer(OtherT *ptr) : super_t(ptr) {}
-
-    /*! This constructor allows construction from another pointer-like object with related type.
-     *
-     *  \param other The \p OtherPointer to copy.
-     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
-     *          to \p thrust::system::omp::tag and its element type shall be convertible to \p T.
-     */
-    template<typename OtherPointer>
-    __host__ __device__
-    pointer(const OtherPointer &other,
-            typename thrust::detail::enable_if_pointer_is_convertible<
-              OtherPointer,
-              pointer
-            >::type * = 0) : super_t(other) {}
-
-    /*! This constructor allows construction from another pointer-like object with \p void type.
-     *
-     *  \param other The \p OtherPointer to copy.
-     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
-     *          to \p thrust::system::omp::tag and its element type shall be \p void.
-     */
-    template<typename OtherPointer>
-    __host__ __device__
-    explicit
-    pointer(const OtherPointer &other,
-            typename thrust::detail::enable_if_void_pointer_is_system_convertible<
-              OtherPointer,
-              pointer
-            >::type * = 0) : super_t(other) {}
-
-    /*! Assignment operator allows assigning from another pointer-like object with related type.
-     *
-     *  \param other The other pointer-like object to assign from.
-     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
-     *          to \p thrust::system::omp::tag and its element type shall be convertible to \p T.
-     */
-    template<typename OtherPointer>
-    __host__ __device__
-    typename thrust::detail::enable_if_pointer_is_convertible<
-      OtherPointer,
-      pointer,
-      pointer &
-    >::type
-    operator=(const OtherPointer &other)
-    {
-      return super_t::operator=(other);
-    }
-
-    #if THRUST_CPP_DIALECT >= 2011
-    // NOTE: This is needed so that Thrust smart pointers can be used in
-    // `std::unique_ptr`.
-    __host__ __device__
-    pointer& operator=(decltype(nullptr))
-    {
-      super_t::operator=(nullptr);
-      return *this;
-    }
-    #endif
-}; // end pointer
-
-
-/*! \p reference is a wrapped reference to an object stored in memory available to the \p omp system.
- *  \p reference is the type of the result of dereferencing a \p omp::pointer.
+template <typename T>
+using pointer = thrust::pointer<
+  T,
+  thrust::system::omp::tag,
+  thrust::tagged_reference<T, thrust::system::omp::tag>
+>;
+
+/*! \p omp::universal_pointer stores a pointer to an object allocated in memory
+ * accessible by the \p omp system and host systems.
  *
- *  \tparam T Specifies the type of the referenced object.
+ *  \p omp::universal_pointer has pointer semantics: it may be dereferenced and
+ *  manipulated with pointer arithmetic.
+ *
+ *  \p omp::universal_pointer can be created with \p omp::universal_allocator
+ *  or by explicitly calling its constructor with a raw pointer.
+ *
+ *  The raw pointer encapsulated by a \p omp::universal_pointer may be obtained
+ *  by eiter its <tt>get</tt> member function or the \p raw_pointer_cast
+ *  function.
+ *
+ *  \note \p omp::universal_pointer is not a "smart" pointer; it is the
+ *        programmer's responsibility to deallocate memory pointed to by
+ *        \p omp::universal_pointer.
+ *
+ *  \tparam T specifies the type of the pointee.
+ *
+ *  \see omp::universal_allocator
+ *  \see raw_pointer_cast
  */
-template<typename T>
-  class reference
-    : public thrust::reference<
-               T,
-               thrust::system::omp::pointer<T>,
-               thrust::system::omp::reference<T>
-             >
-{
-  /*! \cond
-   */
-
-  private:
-    typedef thrust::reference<
-      T,
-      thrust::system::omp::pointer<T>,
-      thrust::system::omp::reference<T>
-    > super_t;
-
-  /*! \endcond
-   */
-
-  public:
-    /*! \cond
-     */
-
-    typedef typename super_t::value_type value_type;
-    typedef typename super_t::pointer    pointer;
-
-    /*! \endcond
-     */
-
-    /*! This constructor initializes this \p reference to refer to an object
-     *  pointed to by the given \p pointer. After this \p reference is constructed,
-     *  it shall refer to the object pointed to by \p ptr.
-     *
-     *  \param ptr A \p pointer to copy from.
-     */
-    __host__ __device__
-    explicit reference(const pointer &ptr)
-      : super_t(ptr)
-    {}
-
-    /*! This constructor accepts a const reference to another \p reference of related type.
-     *  After this \p reference is constructed, it shall refer to the same object as \p other.
-     *
-     *  \param other A \p reference to copy from.
-     *  \tparam OtherT The element type of the other \p reference.
-     *
-     *  \note This constructor is templated primarily to allow initialization of <tt>reference<const T></tt>
-     *        from <tt>reference<T></tt>.
-     */
-    template<typename OtherT>
-    __host__ __device__
-    reference(const reference<OtherT> &other,
-              typename thrust::detail::enable_if_convertible<
-                typename reference<OtherT>::pointer,
-                pointer
-              >::type * = 0)
-      : super_t(other)
-    {}
-
-    /*! Copy assignment operator copy assigns from another \p reference of related type.
-     *
-     *  \param other The other \p reference to assign from.
-     *  \return <tt>*this</tt>
-     *  \tparam OtherT The element type of the other \p reference.
-     */
-    template<typename OtherT>
-    reference &operator=(const reference<OtherT> &other);
-
-    /*! Assignment operator assigns from a \p value_type.
-     *
-     *  \param x The \p value_type to assign from.
-     *  \return <tt>*this</tt>
-     */
-    reference &operator=(const value_type &x);
-}; // end reference
-
-/*! Exchanges the values of two objects referred to by \p reference.
- *  \p x The first \p reference of interest.
- *  \p y The second \p reference of interest.
+template <typename T>
+using universal_pointer = thrust::pointer<
+  T,
+  thrust::system::omp::tag,
+  typename std::add_lvalue_reference<T>::type
+>;
+
+/*! \p reference is a wrapped reference to an object stored in memory available
+ *  to the \p omp system. \p reference is the type of the result of
+ *  dereferencing a \p omp::pointer.
+ *
+ *  \tparam T Specifies the type of the referenced object.
  */
-template<typename T>
-__host__ __device__
-void swap(reference<T> x, reference<T> y);
+template <typename T>
+using reference = thrust::tagged_reference<T, thrust::system::omp::tag>;
 
-} // end omp
+}} // namespace system::omp
 
-/*! \}
+/*! \addtogroup system_backends Systems
+ *  \ingroup system
+ *  \{
  */
 
-} // end system
-
 /*! \namespace thrust::omp
- *  \brief \p thrust::omp is a top-level alias for thrust::system::omp.
- */
+ *  \brief \p thrust::omp is a top-level alias for \p thrust::system::omp. */
 namespace omp
 {
-
 using thrust::system::omp::pointer;
+using thrust::system::omp::universal_pointer;
 using thrust::system::omp::reference;
+} // namespace omp
 
-} // end omp
-
-} // end thrust
-
-#include <thrust/system/omp/detail/pointer.inl>
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/vector.h b/thrust/system/omp/vector.h
index 6ad2bafed..179b5207d 100644
--- a/thrust/system/omp/vector.h
+++ b/thrust/system/omp/vector.h
@@ -26,145 +26,57 @@
 #include <thrust/detail/vector_base.h>
 #include <vector>
 
-namespace thrust
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace omp
 {
 
-// forward declaration of host_vector
-// XXX why is this here? it doesn't seem necessary for anything below
-template<typename T, typename Allocator> class host_vector;
-
-namespace system
-{
-namespace omp
-{
-
-// XXX upon c++11
-// template<typename T, typename Allocator = allocator<T> > using vector = thrust::detail::vector_base<T,Allocator>;
-
 /*! \p omp::vector is a container that supports random access to elements,
  *  constant time removal of elements at the end, and linear time insertion
  *  and removal of elements at the beginning or in the middle. The number of
  *  elements in a \p omp::vector may vary dynamically; memory management is
  *  automatic. The elements contained in an \p omp::vector reside in memory
- *  available to the \p omp system.
+ *  accessible by the \p omp system.
  *
  *  \tparam T The element type of the \p omp::vector.
- *  \tparam Allocator The allocator type of the \p omp::vector. Defaults to \p omp::allocator.
+ *  \tparam Allocator The allocator type of the \p omp::vector.
+ *          Defaults to \p omp::allocator.
  *
- *  \see http://www.sgi.com/tech/stl/Vector.html
+ *  \see https://en.cppreference.com/w/cpp/container/vector
  *  \see host_vector For the documentation of the complete interface which is
- *                   shared by \p omp::vector
+ *                   shared by \p omp::vector.
  *  \see device_vector
+ *  \see universal_vector
  */
-template<typename T, typename Allocator = allocator<T> >
-  class vector
-    : public thrust::detail::vector_base<T,Allocator>
-{
-  /*! \cond
-   */
-  private:
-    typedef thrust::detail::vector_base<T,Allocator> super_t;
-  /*! \endcond
-   */
-
-  public:
-    
-  /*! \cond
-   */
-    typedef typename super_t::size_type  size_type;
-    typedef typename super_t::value_type value_type;
-  /*! \endcond
-   */
-
-    /*! This constructor creates an empty \p omp::vector.
-     */
-    vector();
-
-    /*! This constructor creates a \p omp::vector with \p n default-constructed elements.
-     *  \param n The size of the \p omp::vector to create.
-     */
-    explicit vector(size_type n);
-
-    /*! This constructor creates a \p omp::vector with \p n copies of \p value.
-     *  \param n The size of the \p omp::vector to create.
-     *  \param value An element to copy.
-     */
-    explicit vector(size_type n, const value_type &value);
-
-    /*! Copy constructor copies from another \p omp::vector.
-     *  \param x The other \p omp::vector to copy.
-     */
-    vector(const vector &x);
-
-  #if __cplusplus >= 201103L
-    /*! Move constructor moves another \p omp::vector.
-     *  \param x The other \p omp::vector to move from.
-     */
-    vector(vector &&x);
-  #endif
-
-    /*! This constructor copies from another Thrust vector-like object.
-     *  \param x The other object to copy from.
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector(const thrust::detail::vector_base<OtherT,OtherAllocator> &x);
-
-    /*! This constructor copies from a \c std::vector.
-     *  \param x The \c std::vector to copy from.
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector(const std::vector<OtherT,OtherAllocator> &x);
-
-    /*! This constructor creates an \p omp::vector by copying from a range.
-     *  \param first The beginning of the range.
-     *  \param last The end of the range.
-     */
-    template<typename InputIterator>
-    vector(InputIterator first, InputIterator last);
-
-    // XXX vector_base should take a Derived type so we don't have to define these superfluous assigns
-
-    /*! Copy assignment operator assigns from another \p omp::vector.
-    *  \param x The other object to assign from.
-    *  \return <tt>*this</tt>
-    */
-   vector &operator=(const vector &x);
-
-  #if __cplusplus >= 201103L
-    /*! Move assignment operator moves another \p omp::vector.
-     *  \param x The other \p omp::vector to move.
-     *  \return <tt>*this</tt>
-     */
-     vector &operator=(vector &&x);
-  #endif
-
-    /*! Assignment operator assigns from a \c std::vector.
-     *  \param x The \c std::vector to assign from.
-     *  \return <tt>*this</tt>
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector &operator=(const std::vector<OtherT,OtherAllocator> &x);
-
-    /*! Assignment operator assigns from another Thrust vector-like object.
-     *  \param x The other object to assign from.
-     *  \return <tt>*this</tt>
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector &operator=(const thrust::detail::vector_base<OtherT,OtherAllocator> &x);
-}; // end vector
+template <typename T, typename Allocator = thrust::system::omp::allocator<T>>
+using vector = thrust::detail::vector_base<T, Allocator>;
+
+/*! \p omp::universal_vector is a container that supports random access to
+ *  elements, constant time removal of elements at the end, and linear time
+ *  insertion and removal of elements at the beginning or in the middle. The
+ *  number of elements in a \p omp::universal_vector may vary dynamically;
+ *  memory management is automatic. The elements contained in a
+ *  \p omp::universal_vector reside in memory accessible by the \p omp system
+ *  and host systems.
+ *
+ *  \tparam T The element type of the \p omp::universal_vector.
+ *  \tparam Allocator The allocator type of the \p omp::universal_vector.
+ *          Defaults to \p omp::universal_allocator.
+ *
+ *  \see https://en.cppreference.com/w/cpp/container/vector
+ *  \see host_vector For the documentation of the complete interface which is
+ *                   shared by \p omp::universal_vector
+ *  \see device_vector
+ *  \see universal_vector
+ */
+template <typename T, typename Allocator = thrust::system::omp::universal_allocator<T>>
+using universal_vector = thrust::detail::vector_base<T, Allocator>;
 
-} // end omp
-} // end system
+}} // namespace system::omp
 
-// alias system::omp names at top-level
 namespace omp
 {
-
 using thrust::system::omp::vector;
+using thrust::system::omp::universal_vector;
+}
 
-} // end omp
-
-} // end thrust
-
-#include <thrust/system/omp/detail/vector.inl>
-
+THRUST_NAMESPACE_END
diff --git a/thrust/system/system_error.h b/thrust/system/system_error.h
index 84e453dc6..fb31a2da8 100644
--- a/thrust/system/system_error.h
+++ b/thrust/system/system_error.h
@@ -28,8 +28,7 @@
 
 #include <thrust/system/error_code.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -141,19 +140,19 @@ class system_error
 
     /*! Destructor does not throw.
      */
-    inline virtual ~system_error(void) throw () {};
+    inline virtual ~system_error(void) noexcept {};
     
     /*! Returns an object encoding the error.
      *  \return <tt>ec</tt> or <tt>error_code(ev, ecat)</tt>, from the
      *          constructor, as appropriate.
      */
-    inline const error_code &code(void) const throw();
+    inline const error_code &code(void) const noexcept;
 
     /*! Returns a human-readable string indicating the nature of the error.
      *  \return a string incorporating <tt>code().message()</tt> and the
      *          arguments supplied in the constructor.
      */
-    inline const char *what(void) const throw();
+    inline const char *what(void) const noexcept;
 
     /*! \cond
      */
@@ -173,7 +172,7 @@ class system_error
 // import names into thrust::
 using system::system_error;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/system_error.inl>
 
diff --git a/thrust/system/tbb/detail/adjacent_difference.h b/thrust/system/tbb/detail/adjacent_difference.h
index d22b4aac3..ab519d11e 100644
--- a/thrust/system/tbb/detail/adjacent_difference.h
+++ b/thrust/system/tbb/detail/adjacent_difference.h
@@ -20,8 +20,7 @@
 #include <thrust/system/tbb/detail/execution_policy.h>
 #include <thrust/system/detail/generic/adjacent_difference.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -46,5 +45,5 @@ template<typename DerivedPolicy,
 } // end detail
 } // end tbb
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/copy.h b/thrust/system/tbb/detail/copy.h
index 7977768b0..30e95a98c 100644
--- a/thrust/system/tbb/detail/copy.h
+++ b/thrust/system/tbb/detail/copy.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/tbb/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -51,7 +50,7 @@ OutputIterator copy_n(execution_policy<DerivedPolicy> &exec,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/copy.inl>
 
diff --git a/thrust/system/tbb/detail/copy.inl b/thrust/system/tbb/detail/copy.inl
index 0d96ad48b..1016f40d4 100644
--- a/thrust/system/tbb/detail/copy.inl
+++ b/thrust/system/tbb/detail/copy.inl
@@ -23,8 +23,7 @@
 #include <thrust/detail/type_traits/minimum_type.h>
 #include <thrust/detail/copy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -133,5 +132,5 @@ OutputIterator copy_n(execution_policy<DerivedPolicy> &exec,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/copy_if.h b/thrust/system/tbb/detail/copy_if.h
index 0420893ba..db860f377 100644
--- a/thrust/system/tbb/detail/copy_if.h
+++ b/thrust/system/tbb/detail/copy_if.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/tbb/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -44,7 +43,7 @@ template<typename InputIterator1,
 } // end detail
 } // end tbb
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/copy_if.inl>
 
diff --git a/thrust/system/tbb/detail/copy_if.inl b/thrust/system/tbb/detail/copy_if.inl
index 9c074a9fc..aa2379b8d 100644
--- a/thrust/system/tbb/detail/copy_if.inl
+++ b/thrust/system/tbb/detail/copy_if.inl
@@ -24,8 +24,7 @@
 #include <tbb/blocked_range.h>
 #include <tbb/parallel_scan.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -127,5 +126,5 @@ template<typename InputIterator1,
 } // end detail
 } // end tbb
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/execution_policy.h b/thrust/system/tbb/detail/execution_policy.h
index 1773f3c06..ac4a788e7 100644
--- a/thrust/system/tbb/detail/execution_policy.h
+++ b/thrust/system/tbb/detail/execution_policy.h
@@ -21,8 +21,7 @@
 #include <thrust/iterator/detail/any_system_tag.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 // put the canonical tag in the same ns as the backend's entry points
@@ -79,5 +78,5 @@ using thrust::system::tbb::execution_policy;
 using thrust::system::tbb::tag;
 
 } // end tbb
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/extrema.h b/thrust/system/tbb/detail/extrema.h
index e0dd4c042..c6c747f42 100644
--- a/thrust/system/tbb/detail/extrema.h
+++ b/thrust/system/tbb/detail/extrema.h
@@ -20,8 +20,7 @@
 #include <thrust/system/tbb/detail/execution_policy.h>
 #include <thrust/system/detail/generic/extrema.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -62,6 +61,6 @@ thrust::pair<ForwardIterator,ForwardIterator> minmax_element(execution_policy<De
 } // end detail
 } // end tbb
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
 
diff --git a/thrust/system/tbb/detail/find.h b/thrust/system/tbb/detail/find.h
index e07d322a8..e5dea8e77 100644
--- a/thrust/system/tbb/detail/find.h
+++ b/thrust/system/tbb/detail/find.h
@@ -20,8 +20,7 @@
 #include <thrust/system/detail/generic/find.h>
 #include <thrust/system/tbb/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -42,5 +41,5 @@ InputIterator find_if(execution_policy<DerivedPolicy> &exec,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/for_each.h b/thrust/system/tbb/detail/for_each.h
index dfe5329b8..26c4b539b 100644
--- a/thrust/system/tbb/detail/for_each.h
+++ b/thrust/system/tbb/detail/for_each.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/tbb/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -48,7 +47,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/for_each.inl>
 
diff --git a/thrust/system/tbb/detail/for_each.inl b/thrust/system/tbb/detail/for_each.inl
index 00e025ea0..21dfce9ae 100644
--- a/thrust/system/tbb/detail/for_each.inl
+++ b/thrust/system/tbb/detail/for_each.inl
@@ -14,17 +14,19 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/static_assert.h>
 #include <thrust/distance.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/distance.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
+
 #include <tbb/blocked_range.h>
 #include <tbb/parallel_for.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -78,7 +80,7 @@ RandomAccessIterator for_each_n(execution_policy<DerivedPolicy> &,
 
   // return the end of the range
   return first + n;
-} // end for_each_n 
+} // end for_each_n
 
 
 template<typename DerivedPolicy,
@@ -96,5 +98,5 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/memory.inl b/thrust/system/tbb/detail/memory.inl
index 216480d59..32e28300a 100644
--- a/thrust/system/tbb/detail/memory.inl
+++ b/thrust/system/tbb/detail/memory.inl
@@ -14,14 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/cpp/detail/execution_policy.h>
 #include <thrust/system/tbb/memory.h>
 #include <thrust/system/cpp/memory.h>
 #include <limits>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -82,5 +83,5 @@ inline void free(pointer<void> ptr)
 
 } // end tbb
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/merge.h b/thrust/system/tbb/detail/merge.h
index 44608959c..014e2eb8b 100644
--- a/thrust/system/tbb/detail/merge.h
+++ b/thrust/system/tbb/detail/merge.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/tbb/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -64,7 +63,7 @@ thrust::pair<OutputIterator1,OutputIterator2>
 } // end detail
 } // end tbb
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/merge.inl>
 
diff --git a/thrust/system/tbb/detail/merge.inl b/thrust/system/tbb/detail/merge.inl
index bcc728546..89a01aebf 100644
--- a/thrust/system/tbb/detail/merge.inl
+++ b/thrust/system/tbb/detail/merge.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,6 +14,10 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/temporary_array.h>
 #include <thrust/system/tbb/detail/execution_policy.h>
@@ -22,8 +26,7 @@
 #include <thrust/detail/seq.h>
 #include <tbb/parallel_for.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -54,7 +57,7 @@ struct range
       first2(first2), last2(last2),
       result(result), comp(comp), grain_size(grain_size)
   {}
-  
+
   range(range& r, ::tbb::split)
     : first1(r.first1), last1(r.last1),
       first2(r.first2), last2(r.last2),
@@ -77,7 +80,7 @@ struct range
       mid2 += n2 / 2;
       mid1 = thrust::upper_bound(thrust::seq, first1, last1, raw_reference_cast(*mid2), comp);
     }
-    
+
     // set first range to [first1, mid1), [first2, mid2), result
     r.last1 = mid1;
     r.last2 = mid2;
@@ -150,7 +153,7 @@ struct range
       keys_result(keys_result), values_result(values_result),
       comp(comp), grain_size(grain_size)
   {}
-  
+
   range(range& r, ::tbb::split)
     : keys_first1(r.keys_first1), keys_last1(r.keys_last1),
       keys_first2(r.keys_first2), keys_last2(r.keys_last2),
@@ -176,12 +179,12 @@ struct range
       mid2 += n2 / 2;
       mid1 = thrust::upper_bound(thrust::seq, keys_first1, keys_last1, raw_reference_cast(*mid2), comp);
     }
-    
+
     // set first range to [keys_first1, mid1), [keys_first2, mid2), keys_result, values_result
     r.keys_last1 = mid1;
     r.keys_last2 = mid2;
 
-    // set second range to [mid1, keys_last1), [mid2, keys_last2), keys_result + (mid1 - keys_first1) + (mid2 - keys_first2), values_result + (mid1 - keys_first1) + (mid2 - keys_first2) 
+    // set second range to [mid1, keys_last1), [mid2, keys_last2), keys_result + (mid1 - keys_first1) + (mid2 - keys_first2), values_result + (mid1 - keys_first1) + (mid2 - keys_first2)
     keys_first1 = mid1;
     keys_first2 = mid2;
     values_first1 += thrust::distance(r.keys_first1, mid1);
@@ -225,7 +228,7 @@ template<typename DerivedPolicy,
          typename InputIterator2,
          typename OutputIterator,
          typename StrictWeakOrdering>
-OutputIterator merge(execution_policy<DerivedPolicy> &exec,
+OutputIterator merge(execution_policy<DerivedPolicy> &,
                      InputIterator1 first1,
                      InputIterator1 last1,
                      InputIterator2 first2,
@@ -254,7 +257,7 @@ template <typename DerivedPolicy,
           typename OutputIterator2,
           typename StrictWeakOrdering>
 thrust::pair<OutputIterator1,OutputIterator2>
-  merge_by_key(execution_policy<DerivedPolicy> &exec,
+  merge_by_key(execution_policy<DerivedPolicy> &,
                InputIterator1 keys_first1,
                InputIterator1 keys_last1,
                InputIterator2 keys_first2,
@@ -282,5 +285,5 @@ thrust::pair<OutputIterator1,OutputIterator2>
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/par.h b/thrust/system/tbb/detail/par.h
index d5f35b6d0..308d41e13 100644
--- a/thrust/system/tbb/detail/par.h
+++ b/thrust/system/tbb/detail/par.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/allocator_aware_execution_policy.h>
 #include <thrust/system/tbb/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -35,7 +34,7 @@ struct par_t : thrust::system::tbb::detail::execution_policy<par_t>,
     thrust::system::tbb::detail::execution_policy>
 {
   __host__ __device__
-  par_t() : thrust::system::tbb::detail::execution_policy<par_t>() {}
+  constexpr par_t() : thrust::system::tbb::detail::execution_policy<par_t>() {}
 };
 
 
@@ -58,5 +57,5 @@ using thrust::system::tbb::par;
 
 
 } // end tbb
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/partition.h b/thrust/system/tbb/detail/partition.h
index 80323535c..f9c56b92b 100644
--- a/thrust/system/tbb/detail/partition.h
+++ b/thrust/system/tbb/detail/partition.h
@@ -20,8 +20,7 @@
 #include <thrust/system/tbb/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -81,7 +80,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/partition.inl>
 
diff --git a/thrust/system/tbb/detail/partition.inl b/thrust/system/tbb/detail/partition.inl
index 5085ed906..74ad809da 100644
--- a/thrust/system/tbb/detail/partition.inl
+++ b/thrust/system/tbb/detail/partition.inl
@@ -20,8 +20,7 @@
 #include <thrust/system/tbb/detail/partition.h>
 #include <thrust/system/detail/generic/partition.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -98,5 +97,5 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/pointer.inl b/thrust/system/tbb/detail/pointer.inl
deleted file mode 100644
index 2b21422bc..000000000
--- a/thrust/system/tbb/detail/pointer.inl
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- *  Copyright 2008-2018 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
-namespace thrust
-{
-namespace system
-{
-namespace tbb
-{
-
-
-template<typename T>
-  template<typename OtherT>
-    reference<T> &
-      reference<T>
-        ::operator=(const reference<OtherT> &other)
-{
-  return super_t::operator=(other);
-} // end reference::operator=()
-
-template<typename T>
-  reference<T> &
-    reference<T>
-      ::operator=(const value_type &x)
-{
-  return super_t::operator=(x);
-} // end reference::operator=()
-
-template<typename T>
-__host__ __device__
-void swap(reference<T> a, reference<T> b)
-{
-  a.swap(b);
-} // end swap()
-
-} // end tbb
-} // end system
-} // end thrust
-
diff --git a/thrust/system/tbb/detail/reduce.h b/thrust/system/tbb/detail/reduce.h
index 7381da382..81e8d1f6f 100644
--- a/thrust/system/tbb/detail/reduce.h
+++ b/thrust/system/tbb/detail/reduce.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/tbb/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -48,7 +47,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/reduce.inl>
 
diff --git a/thrust/system/tbb/detail/reduce.inl b/thrust/system/tbb/detail/reduce.inl
index 22a13f63d..47fe6616d 100644
--- a/thrust/system/tbb/detail/reduce.inl
+++ b/thrust/system/tbb/detail/reduce.inl
@@ -26,8 +26,7 @@
 #include <tbb/blocked_range.h>
 #include <tbb/parallel_reduce.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -100,7 +99,7 @@ template<typename DerivedPolicy,
          typename InputIterator, 
          typename OutputType,
          typename BinaryFunction>
-  OutputType reduce(execution_policy<DerivedPolicy> &exec,
+  OutputType reduce(execution_policy<DerivedPolicy> &,
                     InputIterator begin,
                     InputIterator end,
                     OutputType init,
@@ -127,5 +126,5 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/reduce_by_key.h b/thrust/system/tbb/detail/reduce_by_key.h
index d8e3b38c5..04d46e7c0 100644
--- a/thrust/system/tbb/detail/reduce_by_key.h
+++ b/thrust/system/tbb/detail/reduce_by_key.h
@@ -20,8 +20,7 @@
 #include <thrust/system/tbb/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -51,7 +50,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/reduce_by_key.inl>
 
diff --git a/thrust/system/tbb/detail/reduce_by_key.inl b/thrust/system/tbb/detail/reduce_by_key.inl
index a9516e4a1..693abb2e7 100644
--- a/thrust/system/tbb/detail/reduce_by_key.inl
+++ b/thrust/system/tbb/detail/reduce_by_key.inl
@@ -27,12 +27,12 @@
 #include <thrust/detail/range/tail_flags.h>
 #include <tbb/blocked_range.h>
 #include <tbb/parallel_for.h>
-#include <tbb/tbb_thread.h>
+
 #include <cassert>
+#include <thread>
 
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -198,7 +198,7 @@ template<typename Iterator1, typename Iterator2, typename Iterator3, typename It
     const size_type interval_idx = r.begin();
 
     const size_type offset_to_first = interval_size * interval_idx;
-    const size_type offset_to_last = thrust::min(n, offset_to_first + interval_size);
+    const size_type offset_to_last = (thrust::min)(n, offset_to_first + interval_size);
 
     Iterator1 my_keys_first     = keys_first    + offset_to_first;
     Iterator1 my_keys_last      = keys_first    + offset_to_last;
@@ -281,7 +281,7 @@ template<typename DerivedPolicy, typename Iterator1, typename Iterator2, typenam
   }
 
   // count the number of processors
-  const unsigned int p = thrust::max<unsigned int>(1u, ::tbb::tbb_thread::hardware_concurrency());
+  const unsigned int p = thrust::max<unsigned int>(1u, std::thread::hardware_concurrency());
 
   // generate O(P) intervals of sequential work
   // XXX oversubscribing is a tuning opportunity
@@ -337,5 +337,5 @@ template<typename DerivedPolicy, typename Iterator1, typename Iterator2, typenam
 } // end detail
 } // end tbb
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/reduce_intervals.h b/thrust/system/tbb/detail/reduce_intervals.h
index 88fefe43d..7164c3f97 100644
--- a/thrust/system/tbb/detail/reduce_intervals.h
+++ b/thrust/system/tbb/detail/reduce_intervals.h
@@ -27,8 +27,7 @@
 #include <thrust/reduce.h>
 #include <cassert>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -65,7 +64,7 @@ template<typename RandomAccessIterator1, typename RandomAccessIterator2, typenam
     Size interval_idx = r.begin();
 
     Size offset_to_first = interval_size * interval_idx;
-    Size offset_to_last = thrust::min(n, offset_to_first + interval_size);
+    Size offset_to_last = (thrust::min)(n, offset_to_first + interval_size);
 
     RandomAccessIterator1 my_first = first + offset_to_first;
     RandomAccessIterator1 my_last  = first + offset_to_last;
@@ -121,5 +120,5 @@ template<typename DerivedPolicy, typename RandomAccessIterator1, typename Size,
 } // end detail
 } // end tbb
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/remove.h b/thrust/system/tbb/detail/remove.h
index 49f70588d..34cd91799 100644
--- a/thrust/system/tbb/detail/remove.h
+++ b/thrust/system/tbb/detail/remove.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -75,7 +74,7 @@ template<typename ExecutionPolicy,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/remove.inl>
 
diff --git a/thrust/system/tbb/detail/remove.inl b/thrust/system/tbb/detail/remove.inl
index 0a937799d..76d77e64b 100644
--- a/thrust/system/tbb/detail/remove.inl
+++ b/thrust/system/tbb/detail/remove.inl
@@ -20,8 +20,7 @@
 #include <thrust/system/tbb/detail/remove.h>
 #include <thrust/system/detail/generic/remove.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -90,5 +89,5 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/scan.h b/thrust/system/tbb/detail/scan.h
index 32a05a5a6..b31b46317 100644
--- a/thrust/system/tbb/detail/scan.h
+++ b/thrust/system/tbb/detail/scan.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/tbb/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -58,7 +57,7 @@ template<typename InputIterator,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/scan.inl>
 
diff --git a/thrust/system/tbb/detail/scan.inl b/thrust/system/tbb/detail/scan.inl
index 477c04ee3..d6e894983 100644
--- a/thrust/system/tbb/detail/scan.inl
+++ b/thrust/system/tbb/detail/scan.inl
@@ -28,8 +28,7 @@
 #include <tbb/blocked_range.h>
 #include <tbb/parallel_scan.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -104,7 +103,12 @@ struct inclusive_body
 
   void reverse_join(inclusive_body& b)
   {
-    sum = binary_op(b.sum, sum);
+    // Only accumulate this functor's partial sum if this functor has been
+    // called at least once -- otherwise we'll over-count the initial value.
+    if (!first_call)
+    {
+      sum = binary_op(b.sum, sum);
+    }
   } 
 
   void assign(inclusive_body& b)
@@ -172,8 +176,13 @@ struct exclusive_body
 
   void reverse_join(exclusive_body& b)
   {
-    sum = binary_op(b.sum, sum);
-  } 
+    // Only accumulate this functor's partial sum if this functor has been
+    // called at least once -- otherwise we'll over-count the initial value.
+    if (!first_call)
+    {
+      sum = binary_op(b.sum, sum);
+    }
+  }
 
   void assign(exclusive_body& b)
   {
@@ -183,8 +192,6 @@ struct exclusive_body
 
 } // end scan_detail
 
-
-
 template<typename InputIterator,
          typename OutputIterator,
          typename BinaryFunction>
@@ -194,32 +201,12 @@ template<typename InputIterator,
                                 OutputIterator result,
                                 BinaryFunction binary_op)
 {
-  // the pseudocode for deducing the type of the temporary used below:
-  // 
-  // if BinaryFunction is AdaptableBinaryFunction
-  //   TemporaryType = AdaptableBinaryFunction::result_type
-  // else if OutputIterator is a "pure" output iterator
-  //   TemporaryType = InputIterator::value_type
-  // else
-  //   TemporaryType = OutputIterator::value_type
-  //
-  // XXX upon c++0x, TemporaryType needs to be:
-  // result_of_adaptable_function<BinaryFunction>::type
-  
   using namespace thrust::detail;
 
-  typedef typename eval_if<
-    has_result_type<BinaryFunction>::value,
-    result_type<BinaryFunction>,
-    eval_if<
-      is_output_iterator<OutputIterator>::value,
-      thrust::iterator_value<InputIterator>,
-      thrust::iterator_value<OutputIterator>
-    >
-  >::type ValueType;
-  
-  typedef typename thrust::iterator_difference<InputIterator>::type Size; 
-  
+  // Use the input iterator's value type per https://wg21.link/P0571
+  using ValueType = typename thrust::iterator_value<InputIterator>::type;
+
+  using Size = typename thrust::iterator_difference<InputIterator>::type;
   Size n = thrust::distance(first, last);
 
   if (n != 0)
@@ -228,50 +215,29 @@ template<typename InputIterator,
     Body scan_body(first, result, binary_op, *first);
     ::tbb::parallel_scan(::tbb::blocked_range<Size>(0,n), scan_body);
   }
- 
+
   thrust::advance(result, n);
 
   return result;
 }
 
-
 template<typename InputIterator,
          typename OutputIterator,
-         typename T,
+         typename InitialValueType,
          typename BinaryFunction>
   OutputIterator exclusive_scan(tag,
                                 InputIterator first,
                                 InputIterator last,
                                 OutputIterator result,
-                                T init,
+                                InitialValueType init,
                                 BinaryFunction binary_op)
 {
-  // the pseudocode for deducing the type of the temporary used below:
-  // 
-  // if BinaryFunction is AdaptableBinaryFunction
-  //   TemporaryType = AdaptableBinaryFunction::result_type
-  // else if OutputIterator is a "pure" output iterator
-  //   TemporaryType = InputIterator::value_type
-  // else
-  //   TemporaryType = OutputIterator::value_type
-  //
-  // XXX upon c++0x, TemporaryType needs to be:
-  // result_of_adaptable_function<BinaryFunction>::type
-
   using namespace thrust::detail;
 
-  typedef typename eval_if<
-    has_result_type<BinaryFunction>::value,
-    result_type<BinaryFunction>,
-    eval_if<
-      is_output_iterator<OutputIterator>::value,
-      thrust::iterator_value<InputIterator>,
-      thrust::iterator_value<OutputIterator>
-    >
-  >::type ValueType;
-
-  typedef typename thrust::iterator_difference<InputIterator>::type Size; 
-  
+  // Use the initial value type per https://wg21.link/P0571
+  using ValueType = InitialValueType;
+
+  using Size = typename thrust::iterator_difference<InputIterator>::type;
   Size n = thrust::distance(first, last);
 
   if (n != 0)
@@ -280,7 +246,7 @@ template<typename InputIterator,
     Body scan_body(first, result, binary_op, init);
     ::tbb::parallel_scan(::tbb::blocked_range<Size>(0,n), scan_body);
   }
- 
+
   thrust::advance(result, n);
 
   return result;
@@ -289,5 +255,4 @@ template<typename InputIterator,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/system/tbb/detail/sort.h b/thrust/system/tbb/detail/sort.h
index 863189a1e..9c58bf6d4 100644
--- a/thrust/system/tbb/detail/sort.h
+++ b/thrust/system/tbb/detail/sort.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/tbb/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -49,7 +48,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/sort.inl>
 
diff --git a/thrust/system/tbb/detail/sort.inl b/thrust/system/tbb/detail/sort.inl
index ec3b34cf1..103710fba 100644
--- a/thrust/system/tbb/detail/sort.inl
+++ b/thrust/system/tbb/detail/sort.inl
@@ -14,17 +14,19 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/temporary_array.h>
 #include <thrust/detail/copy.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/distance.h>
 #include <thrust/merge.h>
+#include <thrust/sort.h>
 #include <thrust/detail/seq.h>
 #include <tbb/parallel_invoke.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -38,7 +40,7 @@ namespace sort_detail
 // TODO tune this based on data type and comp
 const static int threshold = 128 * 1024;
 
-  
+
 template<typename DerivedPolicy, typename Iterator1, typename Iterator2, typename StrictWeakOrdering>
 void merge_sort(execution_policy<DerivedPolicy> &exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, StrictWeakOrdering comp, bool inplace);
 
@@ -73,7 +75,7 @@ void merge_sort(execution_policy<DerivedPolicy> &exec, Iterator1 first1, Iterato
   if (n < threshold)
   {
     thrust::stable_sort(thrust::seq, first1, last1, comp);
-    
+
     if(!inplace)
     {
       thrust::copy(thrust::seq, first1, last1, first2);
@@ -87,7 +89,7 @@ void merge_sort(execution_policy<DerivedPolicy> &exec, Iterator1 first1, Iterato
   Iterator2 last2 = first2 + n;
 
   typedef merge_sort_closure<DerivedPolicy,Iterator1,Iterator2,StrictWeakOrdering> Closure;
-  
+
   Closure left (exec, first1, mid1,  first2, comp, !inplace);
   Closure right(exec, mid1,   last1, mid2,   comp, !inplace);
 
@@ -108,7 +110,7 @@ namespace sort_by_key_detail
 // TODO tune this based on data type and comp
 const static int threshold = 128 * 1024;
 
-  
+
 template<typename DerivedPolicy,
          typename Iterator1,
          typename Iterator2,
@@ -177,7 +179,7 @@ void merge_sort_by_key(execution_policy<DerivedPolicy> &exec,
   typedef typename thrust::iterator_difference<Iterator1>::type difference_type;
 
   difference_type n = thrust::distance(first1, last1);
-  
+
   Iterator1 mid1  = first1 + (n / 2);
   Iterator2 mid2  = first2 + (n / 2);
   Iterator3 mid3  = first3 + (n / 2);
@@ -188,7 +190,7 @@ void merge_sort_by_key(execution_policy<DerivedPolicy> &exec,
   if (n < threshold)
   {
     thrust::stable_sort_by_key(thrust::seq, first1, last1, first2, comp);
-    
+
     if(!inplace)
     {
       thrust::copy(thrust::seq, first1, last1, first3);
@@ -199,7 +201,7 @@ void merge_sort_by_key(execution_policy<DerivedPolicy> &exec,
   }
 
   typedef merge_sort_by_key_closure<DerivedPolicy,Iterator1,Iterator2,Iterator3,Iterator4,StrictWeakOrdering> Closure;
-  
+
   Closure left (exec, first1, mid1,  first2, first3, first4, comp, !inplace);
   Closure right(exec, mid1,   last1, mid2,   mid3,   mid4,   comp, !inplace);
 
@@ -260,5 +262,5 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/unique.h b/thrust/system/tbb/detail/unique.h
index 2e46d2bb4..843e6406e 100644
--- a/thrust/system/tbb/detail/unique.h
+++ b/thrust/system/tbb/detail/unique.h
@@ -20,8 +20,7 @@
 #include <thrust/system/tbb/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -50,10 +49,20 @@ template<typename ExecutionPolicy,
                              BinaryPredicate binary_pred);
 
 
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(execution_policy<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred);
+
+
 } // end namespace detail
 } // end namespace tbb 
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/unique.inl>
 
diff --git a/thrust/system/tbb/detail/unique.inl b/thrust/system/tbb/detail/unique.inl
index 4ee3c0d9a..136af897c 100644
--- a/thrust/system/tbb/detail/unique.inl
+++ b/thrust/system/tbb/detail/unique.inl
@@ -21,8 +21,7 @@
 #include <thrust/system/detail/generic/unique.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -59,8 +58,22 @@ template<typename DerivedPolicy,
 } // end unique_copy()
 
 
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(execution_policy<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred)
+{
+  // tbb prefers generic::unique_count to cpp::unique_count
+  return thrust::system::detail::generic::unique_count(exec,first,last,binary_pred);
+} // end unique_count()
+
+
 } // end namespace detail
 } // end namespace tbb 
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/unique_by_key.h b/thrust/system/tbb/detail/unique_by_key.h
index 6ab857840..513bb386e 100644
--- a/thrust/system/tbb/detail/unique_by_key.h
+++ b/thrust/system/tbb/detail/unique_by_key.h
@@ -20,8 +20,7 @@
 #include <thrust/system/tbb/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -61,7 +60,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb 
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/unique_by_key.inl>
 
diff --git a/thrust/system/tbb/detail/unique_by_key.inl b/thrust/system/tbb/detail/unique_by_key.inl
index 9c1a150e1..dbd5922b0 100644
--- a/thrust/system/tbb/detail/unique_by_key.inl
+++ b/thrust/system/tbb/detail/unique_by_key.inl
@@ -21,8 +21,7 @@
 #include <thrust/system/detail/generic/unique_by_key.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -70,5 +69,5 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/vector.inl b/thrust/system/tbb/detail/vector.inl
deleted file mode 100644
index fe9d72ab0..000000000
--- a/thrust/system/tbb/detail/vector.inl
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/tbb/vector.h>
-#include <utility>
-
-namespace thrust
-{
-namespace system
-{
-namespace tbb
-{
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector()
-      : super_t()
-{}
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector(size_type n)
-      : super_t(n)
-{}
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector(size_type n, const value_type &value)
-      : super_t(n,value)
-{}
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector(const vector &x)
-      : super_t(x)
-  {}
-
-#if __cplusplus >= 201103L
-  template<typename T, typename Allocator>
-    vector<T,Allocator>
-      ::vector(vector &&x)
-        : super_t(std::move(x))
-  {}
-#endif
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator>
-      ::vector(const thrust::detail::vector_base<OtherT,OtherAllocator> &x)
-        : super_t(x)
-{}
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator>
-      ::vector(const std::vector<OtherT,OtherAllocator> &x)
-        : super_t(x)
-{}
-
-template<typename T, typename Allocator>
-  template<typename InputIterator>
-    vector<T,Allocator>
-      ::vector(InputIterator first, InputIterator last)
-        : super_t(first,last)
-{}
-
-template<typename T, typename Allocator>
-  vector<T,Allocator> &
-    vector<T,Allocator>
-      ::operator=(const vector &x)
-{
-  super_t::operator=(x);
-  return *this;
-}
-
-#if __cplusplus >= 201103L
-  template<typename T, typename Allocator>
-    vector<T,Allocator> &
-      vector<T,Allocator>
-        ::operator=(vector &&x)
-  {
-    super_t::operator=(std::move(x));
-    return *this;
-  }
-#endif
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator> &
-      vector<T,Allocator>
-        ::operator=(const std::vector<OtherT,OtherAllocator> &x)
-{
-  super_t::operator=(x);
-  return *this;
-}
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator> &
-      vector<T,Allocator>
-        ::operator=(const thrust::detail::vector_base<OtherT,OtherAllocator> &x)
-{
-  super_t::operator=(x);
-  return *this;
-}
-    
-} // end tbb
-} // end system
-} // end thrust
-
diff --git a/thrust/system/tbb/execution_policy.h b/thrust/system/tbb/execution_policy.h
index 18f68bfdc..bfa6b7893 100644
--- a/thrust/system/tbb/execution_policy.h
+++ b/thrust/system/tbb/execution_policy.h
@@ -76,8 +76,7 @@
 // define these entities here for the purpose of Doxygenating them
 // they are actually defined elsewhere
 #if 0
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -150,7 +149,7 @@ static const unspecified par;
 
 } // end tbb
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 #endif
 
 
diff --git a/thrust/system/tbb/memory.h b/thrust/system/tbb/memory.h
index 7e801e13a..3bd442232 100644
--- a/thrust/system/tbb/memory.h
+++ b/thrust/system/tbb/memory.h
@@ -2,7 +2,7 @@
  *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
+ *  you may not use this file except in ctbbliance with the License.
  *  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
@@ -27,8 +27,7 @@
 #include <thrust/mr/allocator.h>
 #include <ostream>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -67,83 +66,38 @@ inline pointer<T> malloc(std::size_t n);
  */
 inline void free(pointer<void> ptr);
 
-// XXX upon c++11
-// template<typename T>
-// using allocator = thrust::mr::stateless_resource_allocator<T, memory_resource>;
-
-/*! \p tbb::allocator is the default allocator used by the \p tbb system's containers such as
- *  <tt>tbb::vector</tt> if no user-specified allocator is provided. \p tbb::allocator allocates
- *  (deallocates) storage with \p tbb::malloc (\p tbb::free).
+/*! \p tbb::allocator is the default allocator used by the \p tbb system's
+ *  containers such as <tt>tbb::vector</tt> if no user-specified allocator is
+ *  provided. \p tbb::allocator allocates (deallocates) storage with \p
+ *  tbb::malloc (\p tbb::free).
  */
 template<typename T>
-  struct allocator
-    : thrust::mr::stateless_resource_allocator<
-        T,
-        memory_resource
-    >
-{
-private:
-    typedef thrust::mr::stateless_resource_allocator<
-        T,
-        memory_resource
-    > base;
-
-public:
-  /*! The \p rebind metafunction provides the type of an \p allocator
-   *  instantiated with another type.
-   *
-   *  \tparam U The other type to use for instantiation.
-   */
-  template<typename U>
-    struct rebind
-  {
-    /*! The typedef \p other gives the type of the rebound \p allocator.
-     */
-    typedef allocator<U> other;
-  };
-
-  /*! No-argument constructor has no effect.
-   */
-  __host__ __device__
-  inline allocator() {}
-
-  /*! Copy constructor has no effect.
-   */
-  __host__ __device__
-  inline allocator(const allocator & other) : base(other) {}
+using allocator = thrust::mr::stateless_resource_allocator<
+  T, thrust::system::tbb::memory_resource
+>;
 
-  /*! Constructor from other \p allocator has no effect.
-   */
-  template<typename U>
-  __host__ __device__
-  inline allocator(const allocator<U> & other) : base(other) {}
-
-  /*! Destructor has no effect.
-   */
-  __host__ __device__
-  inline ~allocator() {}
-}; // end allocator
-
-} // end tbb
-
-/*! \}
+/*! \p tbb::universal_allocator allocates memory that can be used by the \p tbb
+ *  system and host systems.
  */
+template<typename T>
+using universal_allocator = thrust::mr::stateless_resource_allocator<
+  T, thrust::system::tbb::universal_memory_resource
+>;
 
-} // end system
+}} // namespace system::tbb
 
 /*! \namespace thrust::tbb
  *  \brief \p thrust::tbb is a top-level alias for thrust::system::tbb.
  */
 namespace tbb
 {
-
 using thrust::system::tbb::malloc;
 using thrust::system::tbb::free;
 using thrust::system::tbb::allocator;
+using thrust::system::tbb::universal_allocator;
+} // namsespace tbb
 
-} // end tbb
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/memory.inl>
 
diff --git a/thrust/system/tbb/memory_resource.h b/thrust/system/tbb/memory_resource.h
index de664eb93..a698b9242 100644
--- a/thrust/system/tbb/memory_resource.h
+++ b/thrust/system/tbb/memory_resource.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2018 NVIDIA Corporation
+ *  Copyright 2018-2020 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -26,11 +26,8 @@
 
 #include <thrust/system/tbb/pointer.h>
 
-namespace thrust
-{
-namespace system
-{
-namespace tbb
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace tbb
 {
 
 //! \cond
@@ -40,24 +37,33 @@ namespace detail
         thrust::mr::new_delete_resource,
         thrust::tbb::pointer<void>
     > native_resource;
-}
+
+    typedef thrust::mr::fancy_pointer_resource<
+        thrust::mr::new_delete_resource,
+        thrust::tbb::universal_pointer<void>
+    > universal_native_resource;
+} // namespace detail
 //! \endcond
 
 /*! \addtogroup memory_resources Memory Resources
- *  \ingroup memory_management_classes
+ *  \ingroup memory_management
  *  \{
  */
 
-/*! The memory resource for the TBB system. Uses \p mr::new_delete_resource and tags it with \p tbb::pointer. */
+/*! The memory resource for the TBB system. Uses \p mr::new_delete_resource and
+ *  tags it with \p tbb::pointer.
+ */
 typedef detail::native_resource memory_resource;
-/*! An alias for \p tbb::memory_resource. */
-typedef detail::native_resource universal_memory_resource;
-/*! An alias for \p tbb::memory_resource. */
+/*! The unified memory resource for the TBB system. Uses
+ *  \p mr::new_delete_resource and tags it with \p tbb::universal_pointer.
+ */
+typedef detail::universal_native_resource universal_memory_resource;
+/*! An alias for \p tbb::universal_memory_resource. */
 typedef detail::native_resource universal_host_pinned_memory_resource;
 
-/*! \}
+/*! \} // memory_resources
  */
 
-}
-}
-}
+}} // namespace system::tbb
+
+THRUST_NAMESPACE_END
diff --git a/thrust/system/tbb/pointer.h b/thrust/system/tbb/pointer.h
index d2912508a..065e1a548 100644
--- a/thrust/system/tbb/pointer.h
+++ b/thrust/system/tbb/pointer.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2018 NVIDIA Corporation
+ *  Copyright 2008-2020 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,114 +14,37 @@
  *  limitations under the License.
  */
 
+/*! \file thrust/system/tbb/memory.h
+ *  \brief Managing memory associated with Thrust's TBB system.
+ */
+
+#pragma once
+
 #include <thrust/detail/config.h>
+#include <type_traits>
 #include <thrust/system/tbb/detail/execution_policy.h>
-#include <thrust/detail/type_traits.h>
 #include <thrust/detail/pointer.h>
 #include <thrust/detail/reference.h>
 
-namespace thrust
-{
-namespace system
-{
-namespace tbb
-{
-
-template<typename> class pointer;
-
-} // end tbb
-} // end system
-} // end thrust
-
-
-/*! \cond
- */
-
-// specialize thrust::iterator_traits to avoid problems with the name of
-// pointer's constructor shadowing its nested pointer type
-// do this before pointer is defined so the specialization is correctly
-// used inside the definition
-namespace thrust
-{
-
-template<typename Element>
-  struct iterator_traits<thrust::system::tbb::pointer<Element> >
-{
-  private:
-    typedef thrust::system::tbb::pointer<Element> ptr;
-
-  public:
-    typedef typename ptr::iterator_category       iterator_category;
-    typedef typename ptr::value_type              value_type;
-    typedef typename ptr::difference_type         difference_type;
-    typedef ptr                                   pointer;
-    typedef typename ptr::reference               reference;
-}; // end iterator_traits
-
-} // end thrust
-
-/*! \endcond
- */
-
-
-namespace thrust
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace tbb
 {
-namespace system
-{
-
-/*! \addtogroup system_backends Systems
- *  \ingroup system
- *  \{
- */
-
-/*! \namespace thrust::system::tbb
- *  \brief \p thrust::system::tbb is the namespace containing functionality for allocating, manipulating,
- *         and deallocating memory available to Thrust's TBB backend system.
- *         The identifiers are provided in a separate namespace underneath <tt>thrust::system</tt>
- *         for import convenience but are also aliased in the top-level <tt>thrust::tbb</tt>
- *         namespace for easy access.
- *
- */
-namespace tbb
-{
-
-// forward declaration of reference for pointer
-template<typename Element> class reference;
-
-/*! \cond
- */
-
-// XXX nvcc + msvc have trouble instantiating reference below
-//     this is a workaround
-namespace detail
-{
-
-template<typename Element>
-  struct reference_msvc_workaround
-{
-  typedef thrust::system::tbb::reference<Element> type;
-}; // end reference_msvc_workaround
-
-} // end detail
-
-/*! \endcond
- */
 
-
-/*! \p pointer stores a pointer to an object allocated in memory available to the tbb system.
- *  This type provides type safety when dispatching standard algorithms on ranges resident
- *  in tbb memory.
+/*! \p tbb::pointer stores a pointer to an object allocated in memory accessible
+ *  by the \p tbb system. This type provides type safety when dispatching
+ *  algorithms on ranges resident in \p tbb memory.
  *
- *  \p pointer has pointer semantics: it may be dereferenced and manipulated with pointer arithmetic.
+ *  \p tbb::pointer has pointer semantics: it may be dereferenced and
+ *  manipulated with pointer arithmetic.
  *
- *  \p pointer can be created with the function \p tbb::malloc, or by explicitly calling its constructor
- *  with a raw pointer.
+ *  \p tbb::pointer can be created with the function \p tbb::malloc, or by
+ *  explicitly calling its constructor with a raw pointer.
  *
- *  The raw pointer encapsulated by a \p pointer may be obtained by eiter its <tt>get</tt> member function
- *  or the \p raw_pointer_cast function.
+ *  The raw pointer encapsulated by a \p tbb::pointer may be obtained by eiter its
+ *  <tt>get</tt> member function or the \p raw_pointer_cast function.
  *
- *  \note \p pointer is not a "smart" pointer; it is the programmer's responsibility to deallocate memory
- *  pointed to by \p pointer.
+ *  \note \p tbb::pointer is not a "smart" pointer; it is the programmer's
+ *        responsibility to deallocate memory pointed to by \p tbb::pointer.
  *
  *  \tparam T specifies the type of the pointee.
  *
@@ -129,226 +52,66 @@ template<typename Element>
  *  \see tbb::free
  *  \see raw_pointer_cast
  */
-template<typename T>
-  class pointer
-    : public thrust::pointer<
-               T,
-               thrust::system::tbb::tag,
-               thrust::system::tbb::reference<T>,
-               thrust::system::tbb::pointer<T>
-             >
-{
-  /*! \cond
-   */
-
-  private:
-    typedef thrust::pointer<
-      T,
-      thrust::system::tbb::tag,
-      //thrust::system::tbb::reference<T>,
-      typename detail::reference_msvc_workaround<T>::type,
-      thrust::system::tbb::pointer<T>
-    > super_t;
-
-  /*! \endcond
-   */
-
-  public:
-    // note that tbb::pointer's member functions need __host__ __device__
-    // to interoperate with nvcc + iterators' dereference member function
-
-    /*! \p pointer's no-argument constructor initializes its encapsulated pointer to \c 0.
-     */
-    __host__ __device__
-    pointer() : super_t() {}
-
-    #if THRUST_CPP_DIALECT >= 2011
-    // NOTE: This is needed so that Thrust smart pointers can be used in
-    // `std::unique_ptr`.
-    __host__ __device__
-    pointer(decltype(nullptr)) : super_t(nullptr) {}
-    #endif
-
-    /*! This constructor allows construction of a <tt>pointer<const T></tt> from a <tt>T*</tt>.
-     *
-     *  \param ptr A raw pointer to copy from, presumed to point to a location in memory
-     *         accessible by the \p tbb system.
-     *  \tparam OtherT \p OtherT shall be convertible to \p T.
-     */
-    template<typename OtherT>
-    __host__ __device__
-    explicit pointer(OtherT *ptr) : super_t(ptr) {}
-
-    /*! This constructor allows construction from another pointer-like object with related type.
-     *
-     *  \param other The \p OtherPointer to copy.
-     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
-     *          to \p thrust::system::tbb::tag and its element type shall be convertible to \p T.
-     */
-    template<typename OtherPointer>
-    __host__ __device__
-    pointer(const OtherPointer &other,
-            typename thrust::detail::enable_if_pointer_is_convertible<
-              OtherPointer,
-              pointer
-            >::type * = 0) : super_t(other) {}
-
-    /*! This constructor allows construction from another pointer-like object with \p void type.
-     *
-     *  \param other The \p OtherPointer to copy.
-     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
-     *          to \p thrust::system::tbb::tag and its element type shall be \p void.
-     */
-    template<typename OtherPointer>
-    __host__ __device__
-    explicit
-    pointer(const OtherPointer &other,
-            typename thrust::detail::enable_if_void_pointer_is_system_convertible<
-              OtherPointer,
-              pointer
-            >::type * = 0) : super_t(other) {}
-
-    /*! Assignment operator allows assigning from another pointer-like object with related type.
-     *
-     *  \param other The other pointer-like object to assign from.
-     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
-     *          to \p thrust::system::tbb::tag and its element type shall be convertible to \p T.
-     */
-    template<typename OtherPointer>
-    __host__ __device__
-    typename thrust::detail::enable_if_pointer_is_convertible<
-      OtherPointer,
-      pointer,
-      pointer &
-    >::type
-    operator=(const OtherPointer &other)
-    {
-      return super_t::operator=(other);
-    }
-
-    #if THRUST_CPP_DIALECT >= 2011
-    // NOTE: This is needed so that Thrust smart pointers can be used in
-    // `std::unique_ptr`.
-    __host__ __device__
-    pointer& operator=(decltype(nullptr))
-    {
-      super_t::operator=(nullptr);
-      return *this;
-    }
-    #endif
-}; // end pointer
-
-
-/*! \p reference is a wrapped reference to an object stored in memory available to the \p tbb system.
- *  \p reference is the type of the result of dereferencing a \p tbb::pointer.
+template <typename T>
+using pointer = thrust::pointer<
+  T,
+  thrust::system::tbb::tag,
+  thrust::tagged_reference<T, thrust::system::tbb::tag>
+>;
+
+/*! \p tbb::universal_pointer stores a pointer to an object allocated in memory
+ * accessible by the \p tbb system and host systems.
  *
- *  \tparam T Specifies the type of the referenced object.
+ *  \p tbb::universal_pointer has pointer semantics: it may be dereferenced and
+ *  manipulated with pointer arithmetic.
+ *
+ *  \p tbb::universal_pointer can be created with \p tbb::universal_allocator
+ *  or by explicitly calling its constructor with a raw pointer.
+ *
+ *  The raw pointer encapsulated by a \p tbb::universal_pointer may be obtained
+ *  by eiter its <tt>get</tt> member function or the \p raw_pointer_cast
+ *  function.
+ *
+ *  \note \p tbb::universal_pointer is not a "smart" pointer; it is the
+ *        programmer's responsibility to deallocate memory pointed to by
+ *        \p tbb::universal_pointer.
+ *
+ *  \tparam T specifies the type of the pointee.
+ *
+ *  \see tbb::universal_allocator
+ *  \see raw_pointer_cast
  */
-template<typename T>
-  class reference
-    : public thrust::reference<
-               T,
-               thrust::system::tbb::pointer<T>,
-               thrust::system::tbb::reference<T>
-             >
-{
-  /*! \cond
-   */
-
-  private:
-    typedef thrust::reference<
-      T,
-      thrust::system::tbb::pointer<T>,
-      thrust::system::tbb::reference<T>
-    > super_t;
-
-  /*! \endcond
-   */
-
-  public:
-    /*! \cond
-     */
-
-    typedef typename super_t::value_type value_type;
-    typedef typename super_t::pointer    pointer;
-
-    /*! \endcond
-     */
-
-    /*! This constructor initializes this \p reference to refer to an object
-     *  pointed to by the given \p pointer. After this \p reference is constructed,
-     *  it shall refer to the object pointed to by \p ptr.
-     *
-     *  \param ptr A \p pointer to copy from.
-     */
-    __host__ __device__
-    explicit reference(const pointer &ptr)
-      : super_t(ptr)
-    {}
-
-    /*! This constructor accepts a const reference to another \p reference of related type.
-     *  After this \p reference is constructed, it shall refer to the same object as \p other.
-     *
-     *  \param other A \p reference to copy from.
-     *  \tparam OtherT The element type of the other \p reference.
-     *
-     *  \note This constructor is templated primarily to allow initialization of <tt>reference<const T></tt>
-     *        from <tt>reference<T></tt>.
-     */
-    template<typename OtherT>
-    __host__ __device__
-    reference(const reference<OtherT> &other,
-              typename thrust::detail::enable_if_convertible<
-                typename reference<OtherT>::pointer,
-                pointer
-              >::type * = 0)
-      : super_t(other)
-    {}
-
-    /*! Copy assignment operator copy assigns from another \p reference of related type.
-     *
-     *  \param other The other \p reference to assign from.
-     *  \return <tt>*this</tt>
-     *  \tparam OtherT The element type of the other \p reference.
-     */
-    template<typename OtherT>
-    reference &operator=(const reference<OtherT> &other);
-
-    /*! Assignment operator assigns from a \p value_type.
-     *
-     *  \param x The \p value_type to assign from.
-     *  \return <tt>*this</tt>
-     */
-    reference &operator=(const value_type &x);
-}; // end reference
-
-/*! Exchanges the values of two objects referred to by \p reference.
- *  \p x The first \p reference of interest.
- *  \p y The second \p reference ot interest.
+template <typename T>
+using universal_pointer = thrust::pointer<
+  T,
+  thrust::system::tbb::tag,
+  typename std::add_lvalue_reference<T>::type
+>;
+
+/*! \p reference is a wrapped reference to an object stored in memory available
+ *  to the \p tbb system. \p reference is the type of the result of
+ *  dereferencing a \p tbb::pointer.
+ *
+ *  \tparam T Specifies the type of the referenced object.
  */
-template<typename T>
-__host__ __device__
-void swap(reference<T> x, reference<T> y);
+template <typename T>
+using reference = thrust::tagged_reference<T, thrust::system::tbb::tag>;
 
-} // end tbb
+}} // namespace system::tbb
 
-/*! \}
+/*! \addtogroup system_backends Systems
+ *  \ingroup system
+ *  \{
  */
 
-} // end system
-
 /*! \namespace thrust::tbb
- *  \brief \p thrust::tbb is a top-level alias for thrust::system::tbb.
- */
+ *  \brief \p thrust::tbb is a top-level alias for \p thrust::system::tbb. */
 namespace tbb
 {
-
 using thrust::system::tbb::pointer;
+using thrust::system::tbb::universal_pointer;
 using thrust::system::tbb::reference;
+} // namespace tbb
 
-} // end tbb
-
-} // end thrust
-
-#include <thrust/system/tbb/detail/pointer.inl>
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/vector.h b/thrust/system/tbb/vector.h
index 918e929b0..8cbbabbd2 100644
--- a/thrust/system/tbb/vector.h
+++ b/thrust/system/tbb/vector.h
@@ -26,140 +26,57 @@
 #include <thrust/detail/vector_base.h>
 #include <vector>
 
-namespace thrust
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace tbb
 {
-namespace system
-{
-namespace tbb
-{
-
-// XXX upon c++11
-// template<typename T, typename Allocator = allocator<T> > using vector = thrust::detail::vector_base<T,Allocator>;
 
 /*! \p tbb::vector is a container that supports random access to elements,
  *  constant time removal of elements at the end, and linear time insertion
  *  and removal of elements at the beginning or in the middle. The number of
  *  elements in a \p tbb::vector may vary dynamically; memory management is
  *  automatic. The elements contained in a \p tbb::vector reside in memory
- *  available to the \p tbb system.
+ *  accessible by the \p tbb system.
  *
  *  \tparam T The element type of the \p tbb::vector.
- *  \tparam Allocator The allocator type of the \p tbb::vector. Defaults to \p tbb::allocator.
+ *  \tparam Allocator The allocator type of the \p tbb::vector.
+ *          Defaults to \p tbb::allocator.
  *
- *  \see http://www.sgi.com/tech/stl/Vector.html
+ *  \see https://en.cppreference.com/w/cpp/container/vector
  *  \see host_vector For the documentation of the complete interface which is
- *                   shared by \p tbb::vector
+ *                   shared by \p tbb::vector.
  *  \see device_vector
+ *  \see universal_vector
  */
-template<typename T, typename Allocator = allocator<T> >
-  class vector
-    : public thrust::detail::vector_base<T,Allocator>
-{
-  /*! \cond
-   */
-  private:
-    typedef thrust::detail::vector_base<T,Allocator> super_t;
-  /*! \endcond
-   */
-
-  public:
-
-  /*! \cond
-   */
-    typedef typename super_t::size_type  size_type;
-    typedef typename super_t::value_type value_type;
-  /*! \endcond
-   */
-
-    /*! This constructor creates an empty \p tbb::vector.
-     */
-    vector();
-
-    /*! This constructor creates a \p tbb::vector with \p n default-constructed elements.
-     *  \param n The size of the \p tbb::vector to create.
-     */
-    explicit vector(size_type n);
-
-    /*! This constructor creates a \p tbb::vector with \p n copies of \p value.
-     *  \param n The size of the \p tbb::vector to create.
-     *  \param value An element to copy.
-     */
-    explicit vector(size_type n, const value_type &value);
-
-    /*! Copy constructor copies from another \p tbb::vector.
-     *  \param x The other \p tbb::vector to copy.
-     */
-    vector(const vector &x);
-    
-  #if __cplusplus >= 201103L
-    /*! Move constructor use the move semantic over another \p tbb::vector.
-     *  \param x The other \p tbb::vector to move from.
-     */
-    vector(vector &&x);
-  #endif
-
-    /*! This constructor copies from another Thrust vector-like object.
-     *  \param x The other object to copy from.
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector(const thrust::detail::vector_base<OtherT,OtherAllocator> &x);
-
-    /*! This constructor copies from a \c std::vector.
-     *  \param x The \c std::vector to copy from.
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector(const std::vector<OtherT,OtherAllocator> &x);
-
-    /*! This constructor creates a \p tbb::vector by copying from a range.
-     *  \param first The beginning of the range.
-     *  \param last The end of the range.
-     */
-    template<typename InputIterator>
-    vector(InputIterator first, InputIterator last);
-
-    // XXX vector_base should take a Derived type so we don't have to define these superfluous assigns
-
-    /*! Assignment operator assigns from another \p tbb::vector.
-     *  \param x The other object to assign from.
-     *  \return <tt>*this</tt>
-     */
-    vector &operator=(const vector &x);
-
-  #if __cplusplus >= 201103L
-    /*! Move assignment operator use move semantic over another \p tbb::vector.
-     *  \param x The other \p tbb::vector to move from.
-     *  \return <tt>*this</tt>
-     */
-     vector &operator=(vector &&x);
-  #endif
-
-    /*! Assignment operator assigns from a \c std::vector.
-     *  \param x The \c std::vector to assign from.
-     *  \return <tt>*this</tt>
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector &operator=(const std::vector<OtherT,OtherAllocator> &x);
-
-    /*! Assignment operator assigns from another Thrust vector-like object.
-     *  \param x The other object to assign from.
-     *  \return <tt>*this</tt>
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector &operator=(const thrust::detail::vector_base<OtherT,OtherAllocator> &x);
-}; // end vector
+template <typename T, typename Allocator = thrust::system::tbb::allocator<T>>
+using vector = thrust::detail::vector_base<T, Allocator>;
+
+/*! \p tbb::universal_vector is a container that supports random access to
+ *  elements, constant time removal of elements at the end, and linear time
+ *  insertion and removal of elements at the beginning or in the middle. The
+ *  number of elements in a \p tbb::universal_vector may vary dynamically;
+ *  memory management is automatic. The elements contained in a
+ *  \p tbb::universal_vector reside in memory accessible by the \p tbb system
+ *  and host systems.
+ *
+ *  \tparam T The element type of the \p tbb::universal_vector.
+ *  \tparam Allocator The allocator type of the \p tbb::universal_vector.
+ *          Defaults to \p tbb::universal_allocator.
+ *
+ *  \see https://en.cppreference.com/w/cpp/container/vector
+ *  \see host_vector For the documentation of the complete interface which is
+ *                   shared by \p tbb::universal_vector
+ *  \see device_vector
+ *  \see universal_vector
+ */
+template <typename T, typename Allocator = thrust::system::tbb::universal_allocator<T>>
+using universal_vector = thrust::detail::vector_base<T, Allocator>;
 
-} // end tbb
-} // end system
+}} // namespace system::tbb
 
-// alias system::tbb names at top-level
 namespace tbb
 {
-
 using thrust::system::tbb::vector;
+using thrust::system::tbb::universal_vector;
+}
 
-} // end tbb
-
-} // end thrust
-
-#include <thrust/system/tbb/detail/vector.inl>
-
+THRUST_NAMESPACE_END
diff --git a/thrust/system_error.h b/thrust/system_error.h
index 7119ac4b6..6bf240e51 100644
--- a/thrust/system_error.h
+++ b/thrust/system_error.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -22,19 +22,18 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup system
  *  \{
  */
 
 /*! \namespace thrust::system
- *  \brief \p thrust::system is the namespace which contains functionality for manipulating
- *         memory specific to one of Thrust's backend systems. It also contains functionality
- *         for reporting error conditions originating from the operating system or other
- *         low-level application program interfaces such as the CUDA runtime.
- *         They are provided in a separate namespace for import convenience but are
+ *  \brief \p thrust::system is the namespace which contains specific Thrust
+ *         backend systems. It also contains functionality for reporting error
+ *         conditions originating from the operating system or other low-level
+ *         application program interfaces such as the CUDA runtime. They are
+ *         provided in a separate namespace for import convenience but are
  *         also aliased in the top-level \p thrust namespace for easy access.
  */
 namespace system
@@ -44,8 +43,7 @@ namespace system
 /*! \} // end system
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/error_code.h>
 #include <thrust/system/system_error.h>
-
diff --git a/thrust/tabulate.h b/thrust/tabulate.h
index 1dcd2c9ee..7cb794550 100644
--- a/thrust/tabulate.h
+++ b/thrust/tabulate.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup transformations
  *  \{
@@ -47,11 +45,11 @@ namespace thrust
  *  \param unary_op The unary operation to apply.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
  *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
  *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
- *  \tparam UnaryOperation is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam UnaryOperation is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                         and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p tabulate to generate the first \c n non-positive integers
@@ -90,11 +88,11 @@ __host__ __device__
  *  \param last The end of the range.
  *  \param unary_op The unary operation to apply.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
  *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
  *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
- *  \tparam UnaryOperation is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam UnaryOperation is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                         and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p tabulate to generate the first \c n non-positive integers:
@@ -122,8 +120,6 @@ template<typename ForwardIterator, typename UnaryOperation>
 /*! \} // end transformations
  */
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/tabulate.inl>
-
diff --git a/thrust/transform.h b/thrust/transform.h
index cefca409a..2d064c13b 100644
--- a/thrust/transform.h
+++ b/thrust/transform.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup algorithms
  */
@@ -52,14 +50,14 @@ namespace thrust
  *  \param first The beginning of the input sequence.
  *  \param last The end of the input sequence.
  *  \param result The beginning of the output sequence.
- *  \param op The tranformation operation.
+ *  \param op The transformation operation.
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to \c UnaryFunction's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                              and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
@@ -82,7 +80,7 @@ namespace thrust
  *  // data is now {5, 0, -2, 3, -2, -4, 0, 1, -2, -8};
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/transform.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/transform
  */
 template<typename DerivedPolicy,
          typename InputIterator,
@@ -110,10 +108,10 @@ __host__ __device__
  *  \param op The tranformation operation.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to \c UnaryFunction's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                              and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
@@ -133,7 +131,7 @@ __host__ __device__
  *  // data is now {5, 0, -2, 3, -2, -4, 0, 1, -2, -8};
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/transform.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/transform
  */
 template<typename InputIterator,
          typename OutputIterator,
@@ -165,12 +163,12 @@ template<typename InputIterator,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator1's \c value_type is convertible to \c BinaryFunction's \c first_argument_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator2's \c value_type is convertible to \c BinaryFunction's \c second_argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam BinaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c BinaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre \p first1 may equal \p result, but the range <tt>[first1, last1)</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
@@ -196,7 +194,7 @@ template<typename InputIterator,
  *  // output is now {-2,  6,  0,  4,  4,  7};
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/transform.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/transform
  */
 template<typename DerivedPolicy,
          typename InputIterator1,
@@ -229,12 +227,12 @@ __host__ __device__
  *  \param op The tranformation operation.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator1's \c value_type is convertible to \c BinaryFunction's \c first_argument_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator2's \c value_type is convertible to \c BinaryFunction's \c second_argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam BinaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c BinaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre \p first1 may equal \p result, but the range <tt>[first1, last1)</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
@@ -257,7 +255,7 @@ __host__ __device__
  *  // output is now {-2,  6,  0,  4,  4,  7};
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/transform.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/transform
  */
 template<typename InputIterator1,
          typename InputIterator2,
@@ -294,13 +292,13 @@ template<typename InputIterator1,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *                        and \c InputIterator's \c value_type is convertible to \c Predicate's \c argument_type,
  *                        and \c InputIterator's \c value_type is convertible to \c UnaryFunction's \c argument_type.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                        and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
  *
@@ -369,13 +367,13 @@ __host__ __device__
  *  \param pred The predicate operation.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *                        and \c InputIterator's \c value_type is convertible to \c Predicate's \c argument_type,
  *                        and \c InputIterator's \c value_type is convertible to \c UnaryFunction's \c argument_type.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                        and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
  *
@@ -444,14 +442,14 @@ template<typename InputIterator,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator1's \c value_type is convertible to \c UnaryFunction's \c argument_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c Predicate's \c argument_type.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                        and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
  *  \pre \p stencil may equal \p result, but the range <tt>[stencil, stencil + (last - first))</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
@@ -516,14 +514,14 @@ __host__ __device__
  *  \param pred The predicate operation.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator1's \c value_type is convertible to \c UnaryFunction's \c argument_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c Predicate's \c argument_type.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                        and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
  *  \pre \p stencil may equal \p result, but the range <tt>[stencil, stencil + (last - first))</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
@@ -588,14 +586,14 @@ template<typename InputIterator1,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator1's \c value_type is convertible to \c BinaryFunction's \c first_argument_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c BinaryFunction's \c second_argument_type.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>.
- *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam BinaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                         and \c BinaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre \p first1 may equal \p result, but the range <tt>[first1, last1)</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
  *  \pre \p first2 may equal \p result, but the range <tt>[first2, first2 + (last1 - first1))</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
@@ -667,14 +665,14 @@ __host__ __device__
  *  \param pred The predicate operation.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator1's \c value_type is convertible to \c BinaryFunction's \c first_argument_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c BinaryFunction's \c second_argument_type.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>.
- *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam BinaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                         and \c BinaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre \p first1 may equal \p result, but the range <tt>[first1, last1)</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
  *  \pre \p first2 may equal \p result, but the range <tt>[first2, first2 + (last1 - first1))</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
@@ -718,8 +716,6 @@ template<typename InputIterator1,
 /*! \} // end transformations
  */
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/transform.inl>
-
diff --git a/thrust/transform_reduce.h b/thrust/transform_reduce.h
index 32e172d1e..11d6b84c3 100644
--- a/thrust/transform_reduce.h
+++ b/thrust/transform_reduce.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup reductions
  *  \{
@@ -61,13 +59,13 @@ namespace thrust
  *  \return The result of the transformed reduction.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>,
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>,
  *          and \p UnaryFunction's \c result_type is convertible to \c OutputType.
- *  \tparam OutputType is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam OutputType is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type.
- *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *  \tparam BinaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>,
  *          and \p BinaryFunction's \c result_type is convertible to \p OutputType.
  *
  *  The following code snippet demonstrates how to use \p transform_reduce
@@ -137,13 +135,13 @@ __host__ __device__
  *  \param binary_op The reduction operation.
  *  \return The result of the transformed reduction.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>,
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>,
  *          and \p UnaryFunction's \c result_type is convertible to \c OutputType.
- *  \tparam OutputType is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam OutputType is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type.
- *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *  \tparam BinaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>,
  *          and \p BinaryFunction's \c result_type is convertible to \p OutputType.
  *
  *  The following code snippet demonstrates how to use \p transform_reduce
@@ -191,8 +189,6 @@ template<typename InputIterator,
  *  \} // end reductions
  */
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/transform_reduce.inl>
-
diff --git a/thrust/transform_scan.h b/thrust/transform_scan.h
index 8bb883d54..6c0fe8116 100644
--- a/thrust/transform_scan.h
+++ b/thrust/transform_scan.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup algorithms
  */
@@ -64,13 +62,13 @@ namespace thrust
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                               and \c InputIterator's \c value_type is convertible to \c unary_op's input type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                               and accepts inputs of \c InputIterator's \c value_type.  \c UnaryFunction's result_type
  *                               is convertable to \c OutputIterator's \c value_type.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c AssociativeOperator's \c result_type is
  *                              convertible to \c OutputIterator's \c value_type.
  *
@@ -130,13 +128,13 @@ __host__ __device__
  *  \param binary_op The associatve operator used to 'sum' transformed values.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                               and \c InputIterator's \c value_type is convertible to \c unary_op's input type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                               and accepts inputs of \c InputIterator's \c value_type.  \c UnaryFunction's result_type
  *                               is convertable to \c OutputIterator's \c value_type.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c AssociativeOperator's \c result_type is
  *                              convertible to \c OutputIterator's \c value_type.
  *
@@ -195,14 +193,14 @@ template<typename InputIterator,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                               and \c InputIterator's \c value_type is convertible to \c unary_op's input type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                               and accepts inputs of \c InputIterator's \c value_type.  \c UnaryFunction's result_type
  *                               is convertable to \c OutputIterator's \c value_type.
  *  \tparam T is convertible to \c OutputIterator's \c value_type.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c AssociativeOperator's \c result_type is
  *                              convertible to \c OutputIterator's \c value_type.
  *
@@ -265,14 +263,14 @@ __host__ __device__
  *  \param binary_op The associatve operator used to 'sum' transformed values.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                               and \c InputIterator's \c value_type is convertible to \c unary_op's input type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                               and accepts inputs of \c InputIterator's \c value_type.  \c UnaryFunction's result_type
  *                               is convertable to \c OutputIterator's \c value_type.
  *  \tparam T is convertible to \c OutputIterator's \c value_type.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c AssociativeOperator's \c result_type is
  *                              convertible to \c OutputIterator's \c value_type.
  *
@@ -317,8 +315,6 @@ template<typename InputIterator,
 /*! \} // end prefixsums
  */
 
-	
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/transform_scan.inl>
-
diff --git a/thrust/tuple.h b/thrust/tuple.h
index 930f90326..04f3154a3 100644
--- a/thrust/tuple.h
+++ b/thrust/tuple.h
@@ -16,12 +16,12 @@
 
 
 /*! \file tuple.h
- *  \brief A type encapsulating a heterogeneous collection of elements
+ *  \brief A type encapsulating a heterogeneous collection of elements.
  */
 
 /*
  * Copyright (C) 1999, 2000 Jaakko Järvi (jaakko.jarvi@cs.utu.fi)
- * 
+ *
  * Distributed under the Boost Software License, Version 1.0.
  * (See accompanying NOTICE file for the complete license)
  *
@@ -34,8 +34,7 @@
 #include <thrust/detail/tuple.inl>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup utility
  *  \{
@@ -62,17 +61,7 @@ struct null_type;
  *  \see pair
  *  \see tuple
  */
-template<int N, class T>
-  struct tuple_element
-{
-  private:
-    typedef typename T::tail_type Next;
-
-  public:
-    /*! The result of this metafunction is returned in \c type.
-     */
-    typedef typename tuple_element<N-1, Next>::type type;
-}; // end tuple_element
+template <size_t N, class T> struct tuple_element;
 
 /*! This metafunction returns the number of elements
  *  of a \p tuple type of interest.
@@ -82,13 +71,8 @@ template<int N, class T>
  *  \see pair
  *  \see tuple
  */
-template<class T>
-  struct tuple_size
-{
-  /*! The result of this metafunction is returned in \c value.
-   */
-  static const int value = 1 + tuple_size<typename T::tail_type>::value;
-}; // end tuple_size
+template <class T> struct tuple_size;
+
 
 // get function for non-const cons-lists, returns a reference to the element
 
@@ -155,12 +139,12 @@ get(const detail::cons<HT, TT>& t);
 
 
 
-/*! \p tuple is a class template that can be instantiated with up to ten arguments.
- *  Each template argument specifies the type of element in the \p tuple.
- *  Consequently, tuples are heterogeneous, fixed-size collections of values. An
- *  instantiation of \p tuple with two arguments is similar to an instantiation
- *  of \p pair with the same two arguments. Individual elements of a \p tuple may
- *  be accessed with the \p get function.
+/*! \brief \p tuple is a class template that can be instantiated with up to ten
+ *  arguments. Each template argument specifies the type of element in the \p
+ *  tuple. Consequently, tuples are heterogeneous, fixed-size collections of
+ *  values. An instantiation of \p tuple with two arguments is similar to an
+ *  instantiation of \p pair with the same two arguments. Individual elements
+ *  of a \p tuple may be accessed with the \p get function.
  *
  *  \tparam TN The type of the <tt>N</tt> \c tuple element. Thrust's \p tuple
  *          type currently supports up to ten elements.
@@ -171,18 +155,20 @@ get(const detail::cons<HT, TT>& t);
  *  \code
  *  #include <thrust/tuple.h>
  *  #include <iostream>
- *  ...
- *  // create a tuple containing an int, a float, and a string
- *  thrust::tuple<int, float, const char*> t(13, 0.1f, "thrust");
+ *  
+ *  int main() {
+ *    // Create a tuple containing an `int`, a `float`, and a string.
+ *    thrust::tuple<int, float, const char*> t(13, 0.1f, "thrust");
  *
- *  // individual members are accessed with the free function get
- *  std::cout << "The first element's value is " << thrust::get<0>(t) << std::endl; 
+ *    // Individual members are accessed with the free function `get`.
+ *    std::cout << "The first element's value is " << thrust::get<0>(t) << std::endl;
  *
- *  // or the member function get
- *  std::cout << "The second element's value is " << t.get<1>() << std::endl;
+ *    // ... or the member function `get`.
+ *    std::cout << "The second element's value is " << t.get<1>() << std::endl;
  *
- *  // we can also modify elements with the same function
- *  thrust::get<0>(t) += 10;
+ *    // We can also modify elements with the same function.
+ *    thrust::get<0>(t) += 10;
+ *  }
  *  \endcode
  *
  *  \see pair
@@ -194,8 +180,12 @@ get(const detail::cons<HT, TT>& t);
  */
 template <class T0, class T1, class T2, class T3, class T4,
           class T5, class T6, class T7, class T8, class T9>
-  class tuple :
-    public detail::map_tuple_to_cons<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9>::type
+  class tuple
+  /*! \cond
+   */
+    : public detail::map_tuple_to_cons<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9>::type
+  /*! \endcond
+   */
 {
   /*! \cond
    */
@@ -207,6 +197,7 @@ template <class T0, class T1, class T2, class T3, class T4,
    */
 
   public:
+
   /*! \p tuple's no-argument constructor initializes each element.
    */
   inline __host__ __device__
@@ -216,7 +207,7 @@ template <class T0, class T1, class T2, class T3, class T4,
    *     and intializes all other elements.
    *  \param t0 The value to assign to this \p tuple's first element.
    */
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(typename access_traits<T0>::parameter_type t0)
     : inherited(t0,
                 static_cast<const null_type&>(null_type()),
@@ -235,7 +226,7 @@ template <class T0, class T1, class T2, class T3, class T4,
    *  \param t1 The value to assign to this \p tuple's second element.
    *  \note \p tuple's constructor has ten variants of this form, the rest of which are ommitted here for brevity.
    */
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(typename access_traits<T0>::parameter_type t0,
         typename access_traits<T1>::parameter_type t1)
     : inherited(t0, t1,
@@ -251,7 +242,7 @@ template <class T0, class T1, class T2, class T3, class T4,
   /*! \cond
    */
 
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(typename access_traits<T0>::parameter_type t0,
         typename access_traits<T1>::parameter_type t1,
         typename access_traits<T2>::parameter_type t2)
@@ -264,7 +255,7 @@ template <class T0, class T1, class T2, class T3, class T4,
                 static_cast<const null_type&>(null_type()),
                 static_cast<const null_type&>(null_type())) {}
 
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(typename access_traits<T0>::parameter_type t0,
         typename access_traits<T1>::parameter_type t1,
         typename access_traits<T2>::parameter_type t2,
@@ -277,7 +268,7 @@ template <class T0, class T1, class T2, class T3, class T4,
                 static_cast<const null_type&>(null_type()),
                 static_cast<const null_type&>(null_type())) {}
 
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(typename access_traits<T0>::parameter_type t0,
         typename access_traits<T1>::parameter_type t1,
         typename access_traits<T2>::parameter_type t2,
@@ -290,7 +281,7 @@ template <class T0, class T1, class T2, class T3, class T4,
                 static_cast<const null_type&>(null_type()),
                 static_cast<const null_type&>(null_type())) {}
 
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(typename access_traits<T0>::parameter_type t0,
         typename access_traits<T1>::parameter_type t1,
         typename access_traits<T2>::parameter_type t2,
@@ -303,7 +294,7 @@ template <class T0, class T1, class T2, class T3, class T4,
                 static_cast<const null_type&>(null_type()),
                 static_cast<const null_type&>(null_type())) {}
 
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(typename access_traits<T0>::parameter_type t0,
         typename access_traits<T1>::parameter_type t1,
         typename access_traits<T2>::parameter_type t2,
@@ -316,7 +307,7 @@ template <class T0, class T1, class T2, class T3, class T4,
                 static_cast<const null_type&>(null_type()),
                 static_cast<const null_type&>(null_type())) {}
 
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(typename access_traits<T0>::parameter_type t0,
         typename access_traits<T1>::parameter_type t1,
         typename access_traits<T2>::parameter_type t2,
@@ -329,7 +320,7 @@ template <class T0, class T1, class T2, class T3, class T4,
                 static_cast<const null_type&>(null_type()),
                 static_cast<const null_type&>(null_type())) {}
 
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(typename access_traits<T0>::parameter_type t0,
         typename access_traits<T1>::parameter_type t1,
         typename access_traits<T2>::parameter_type t2,
@@ -342,7 +333,7 @@ template <class T0, class T1, class T2, class T3, class T4,
     : inherited(t0, t1, t2, t3, t4, t5, t6, t7, t8,
                 static_cast<const null_type&>(null_type())) {}
 
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(typename access_traits<T0>::parameter_type t0,
         typename access_traits<T1>::parameter_type t1,
         typename access_traits<T2>::parameter_type t2,
@@ -357,12 +348,12 @@ template <class T0, class T1, class T2, class T3, class T4,
 
 
   template<class U1, class U2>
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(const detail::cons<U1, U2>& p) : inherited(p) {}
 
   __thrust_exec_check_disable__
   template <class U1, class U2>
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple& operator=(const detail::cons<U1, U2>& k)
   {
     inherited::operator=(k);
@@ -581,5 +572,4 @@ bool operator>(const null_type&, const null_type&);
 /*! \} // utility
  */
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/type_traits/integer_sequence.h b/thrust/type_traits/integer_sequence.h
index 4d04653d1..26ea54213 100644
--- a/thrust/type_traits/integer_sequence.h
+++ b/thrust/type_traits/integer_sequence.h
@@ -1,14 +1,23 @@
-///////////////////////////////////////////////////////////////////////////////
-//  Copyright (c)      2018 NVIDIA Corporation
-//  Copyright (c) 2015-2018 Bryce Adelstein Lelbach aka wash
-//
-//  Distributed under the Boost Software License, Version 1.0. (See accompanying
-//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
-///////////////////////////////////////////////////////////////////////////////
+/*
+ *  Copyright 2008-2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
 
-/*! \file integer_sequence.h
- *  \brief C++14's \c integer_sequence and associated helper aliases plus some
- *         extensions.
+/*! \file
+ *  \brief C++14's
+ *  <a href="https://en.cppreference.com/w/cpp/utility/integer_sequence"><tt>std::index_sequence</tt></a>,
+ *  associated helper aliases, and some related extensions.
  */
 
 #pragma once
@@ -23,46 +32,90 @@
 #include <cstdint>
 #include <utility>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
-#if THRUST_CPP_DIALECT >= 2014
+/*! \addtogroup utility
+ *  \{
+ */
+
+/*! \addtogroup type_traits Type Traits
+ *  \{
+ */
 
-// A compile-time sequence of integral constants of type T.
+/*! \brief A compile-time sequence of
+ *  <a href="https://en.cppreference.com/w/cpp/language/constant_expression#Integral_constant_expression"><i>integral constants</i></a>
+ *  of type \c T with values <tt>Is...</tt>.
+ *
+ *  \see <a href="https://en.cppreference.com/w/cpp/language/constant_expression#Integral_constant_expression"><i>integral constants</i></a>
+ *  \see index_sequence
+ *  \see make_integer_sequence
+ *  \see make_reversed_integer_sequence
+ *  \see make_index_sequence
+ *  \see make_reversed_index_sequence
+ *  \see integer_sequence_push_front
+ *  \see integer_sequence_push_back
+ *  \see <a href="https://en.cppreference.com/w/cpp/utility/integer_sequence"><tt>std::integer_sequence</tt></a>
+ */
+#if THRUST_CPP_DIALECT >= 2014
 template <typename T, T... Is>
 using integer_sequence = std::integer_sequence<T, Is...>;
+#else
+template <typename T, T... Is>
+struct integer_sequence
+{
+  using type = integer_sequence;
+  using value_type = T;
+  using size_type = std::size_t;
 
-// A compile-time sequence of std::size_t constants.
-template <std::size_t... Is>
-using index_sequence = std::index_sequence<Is...>;
-
-// Create a new integer_sequence with elements 0, 1, 2, ..., N - 1.
-template <typename T, std::size_t N>
-using make_integer_sequence = std::make_integer_sequence<T, N>;
-
-// Create a new index_sequence with elements 0, 1, 2, ..., N - 1.
-template <std::size_t N>
-using make_index_sequence = std::make_index_sequence<N>;
+  __host__ __device__
+  static constexpr size_type size() noexcept
+  {
+    return sizeof...(Is);
+  }
+};
+#endif
 
 ///////////////////////////////////////////////////////////////////////////////
 
-#else // Older than C++14.
-
-// A compile-time sequence of integral constants of type T.
-template <typename T, T... Is>
-struct integer_sequence;
-
-// A compile-time sequence of std::size_t constants.
+/*! \brief A compile-time sequence of type
+ *  <a href="https://en.cppreference.com/w/cpp/types/size_t">std::size_t</a>
+ *  with values <tt>Is...</tt>.
+ *
+ *  \see integer_sequence
+ *  \see make_integer_sequence
+ *  \see make_reversed_integer_sequence
+ *  \see make_index_sequence
+ *  \see make_reversed_index_sequence
+ *  \see integer_sequence_push_front
+ *  \see integer_sequence_push_back
+ *  \see <a href="https://en.cppreference.com/w/cpp/utility/integer_sequence"><tt>std::index_sequence</tt></a>
+ */
+#if THRUST_CPP_DIALECT >= 2014
+template <std::size_t... Is>
+using index_sequence = std::index_sequence<Is...>;
+#else
 template <std::size_t... Is>
 using index_sequence = integer_sequence<std::size_t, Is...>;
+#endif
 
-///////////////////////////////////////////////////////////////////////////////
+#if THRUST_CPP_DIALECT < 2014
+/*! \cond
+ */
 
 namespace detail
 {
 
-// Create a new integer_sequence containing the elements of Sequence0 followed
-// by the elements of Sequence1. Sequence0::size() is added to each element from
-// Sequence1 in the new sequence.
+/*! \brief Create a new \c integer_sequence containing the elements of \c
+ * Sequence0 followed by the elements of \c Sequence1. \c Sequence0::size() is
+ * added to each element from \c Sequence1 in the new sequence.
+ *
+ *  \see integer_sequence
+ *  \see index_sequence
+ *  \see make_reversed_integer_sequence
+ *  \see make_index_sequence
+ *  \see make_reversed_index_sequence
+ *  \see merge_and_renumber_reversed_integer_sequences_impl
+ */
 template <typename Sequence0, typename Sequence1>
   struct merge_and_renumber_integer_sequences_impl;
 template <typename Sequence0, typename Sequence1>
@@ -71,41 +124,35 @@ template <typename Sequence0, typename Sequence1>
           Sequence0, Sequence1
       >::type;
 
-// Create a new integer_sequence with elements 0, 1, 2, ..., N - 1.
 template <typename T, std::size_t N>
   struct make_integer_sequence_impl;
 
-
 } // namespace detail
 
-///////////////////////////////////////////////////////////////////////////////
-
-// Create a new integer_sequence with elements 0, 1, 2, ..., N - 1.
+/*! \endcond
+ */
+#endif
+
+/*! \brief Create a new \c integer_sequence with elements
+ *  <tt>0, 1, 2, ..., N - 1</tt> of type \c T.
+ *
+ *  \see integer_sequence
+ *  \see index_sequence
+ *  \see make_reversed_integer_sequence
+ *  \see make_index_sequence
+ *  \see make_reversed_index_sequence
+ *  \see <a href="https://en.cppreference.com/w/cpp/utility/integer_sequence"><tt>std::make_integer_sequence</tt></a>
+ */
+#if THRUST_CPP_DIALECT >= 2014
+template <typename T, std::size_t N>
+using make_integer_sequence = std::make_integer_sequence<T, N>;
+#else
 template <typename T, std::size_t N>
 using make_integer_sequence =
   typename detail::make_integer_sequence_impl<T, N>::type;
 
-// Create a new index_sequence with elements 0, 1, 2, ..., N - 1.
-template <std::size_t N>
-using make_index_sequence =
-  make_integer_sequence<std::size_t, N>;
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <typename T, T... Is>
-struct integer_sequence
-{
-  using type = integer_sequence;
-  using value_type = T;
-  using size_type = std::size_t;
-
-  __host__ __device__
-  static constexpr size_type size() noexcept
-  {
-    return sizeof...(Is);
-  }
-};
-///////////////////////////////////////////////////////////////////////////////
+/*! \cond
+ */
 
 namespace detail
 {
@@ -118,8 +165,6 @@ struct merge_and_renumber_integer_sequences_impl<
   using type = integer_sequence<T, Is0..., (sizeof...(Is0) + Is1)...>;
 };
 
-///////////////////////////////////////////////////////////////////////////////
-
 template <typename T, std::size_t N>
 struct make_integer_sequence_impl
 {
@@ -143,16 +188,53 @@ struct make_integer_sequence_impl<T, 1>
 
 } // namespace detail
 
-#endif // THRUST_CPP_DIALECT >= 2014
+/*! \endcond
+ */
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \brief Create a new \c integer_sequence with elements
+ *  <tt>0, 1, 2, ..., N - 1</tt> of type
+ *  <a href="https://en.cppreference.com/w/cpp/types/size_t">std::size_t</a>.
+ *
+ *  \see integer_sequence
+ *  \see index_sequence
+ *  \see make_integer_sequence
+ *  \see make_reversed_integer_sequence
+ *  \see make_reversed_index_sequence
+ *  \see <a href="https://en.cppreference.com/w/cpp/utility/integer_sequence"><tt>std::make_index_sequence</tt></a>
+ */
+#if THRUST_CPP_DIALECT >= 2014
+template <std::size_t N>
+using make_index_sequence = std::make_index_sequence<N>;
+#else
+template <std::size_t N>
+using make_index_sequence =
+  make_integer_sequence<std::size_t, N>;
+#endif
 
 ///////////////////////////////////////////////////////////////////////////////
 
+/*! \cond
+ */
+
 namespace detail
 {
 
-// Create a new integer_sequence containing the elements of Sequence0 followed
-// by the elements of Sequence1. Sequence1::size() is added to each element from
-// Sequence0 in the new sequence.
+/*! \brief Create a new \c integer_sequence containing the elements of \c
+ *  Sequence0 followed by the elements of \c Sequence1. \c Sequence1::size() is
+ *  added to each element from \c Sequence0 in the new sequence.
+ *
+ *  \see make_reversed_integer_sequence
+ *  \see integer_sequence
+ *  \see index_sequence
+ *  \see make_integer_sequence
+ *  \see make_reversed_integer_sequence
+ *  \see make_index_sequence
+ *  \see make_reversed_index_sequence
+ *  \see merge_and_renumber_integer_sequences_impl
+ */
 template <typename Sequence0, typename Sequence1>
   struct merge_and_renumber_reversed_integer_sequences_impl;
 template <typename Sequence0, typename Sequence1>
@@ -161,56 +243,85 @@ template <typename Sequence0, typename Sequence1>
           Sequence0, Sequence1
       >::type;
 
-// Create a new integer_sequence with elements N - 1, N - 2, N - 3, ..., 0.
 template <typename T, std::size_t N>
 struct make_reversed_integer_sequence_impl;
 
-// Add a new element to the front of an integer_sequence<>.
-template <typename T, T I, typename Sequence> 
+template <typename T, T Value, typename Sequence>
 struct integer_sequence_push_front_impl;
 
-// Add a new element to the back of an integer_sequence<>.
-template <typename T, T I, typename Sequence> 
+template <typename T, T Value, typename Sequence>
 struct integer_sequence_push_back_impl;
 
-}
+template <typename T, T... Is0, T... Is1>
+struct merge_and_renumber_reversed_integer_sequences_impl<
+  integer_sequence<T, Is0...>, integer_sequence<T, Is1...>
+>
+{
+  using type = integer_sequence<T, (sizeof...(Is1) + Is0)..., Is1...>;
+};
+
+} // namespace detail
+
+/*! \endcond
+ */
 
 ///////////////////////////////////////////////////////////////////////////////
 
-// Create a new integer_sequence with elements N - 1, N - 2, N - 3, ..., 0.
+/*! \brief Create a new \c integer_sequence with elements
+ *  <tt>N - 1, N - 2, N - 3, ..., 0</tt>.
+ *
+ *  \see integer_sequence
+ *  \see index_sequence
+ *  \see make_integer_sequence
+ *  \see make_index_sequence
+ *  \see make_reversed_index_sequence
+ */
 template <typename T, std::size_t N>
 using make_reversed_integer_sequence =
   typename detail::make_reversed_integer_sequence_impl<T, N>::type;
 
-// Create a new index_sequence with elements N - 1, N - 2, N - 3, ..., 0.
+/*! \brief Create a new \c index_sequence with elements
+ *  <tt>N - 1, N - 2, N - 3, ..., 0</tt>.
+ *
+ *  \see integer_sequence
+ *  \see index_sequence
+ *  \see make_integer_sequence
+ *  \see make_reversed_integer_sequence
+ *  \see make_reversed_index_sequence
+ */
 template <std::size_t N>
 using make_reversed_index_sequence =
   make_reversed_integer_sequence<std::size_t, N>;
 
-// Add a new element to the front of an integer_sequence<>.
-template <typename T, T I, typename Sequence> 
+/*! \brief Add a new element to the front of an \c integer_sequence.
+ *
+ *  \see integer_sequence
+ *  \see index_sequence
+ *  \see make_integer_sequence
+ *  \see make_index_sequence
+ */
+template <typename T, T Value, typename Sequence>
 using integer_sequence_push_front =
-  typename detail::integer_sequence_push_front_impl<T, I, Sequence>::type;
-
-// Add a new element to the back of an integer_sequence<>.
-template <typename T, T I, typename Sequence> 
+  typename detail::integer_sequence_push_front_impl<T, Value, Sequence>::type;
+
+/*! \brief Add a new element to the back of an \c integer_sequence.
+ *
+ *  \see integer_sequence
+ *  \see index_sequence
+ *  \see make_integer_sequence
+ *  \see make_index_sequence
+ */
+template <typename T, T Value, typename Sequence>
 using integer_sequence_push_back =
-  typename detail::integer_sequence_push_back_impl<T, I, Sequence>::type;
+  typename detail::integer_sequence_push_back_impl<T, Value, Sequence>::type;
 
 ///////////////////////////////////////////////////////////////////////////////
 
-namespace detail
-{
+/*! \cond
+ */
 
-template <typename T, T... Is0, T... Is1>
-struct merge_and_renumber_reversed_integer_sequences_impl<
-  integer_sequence<T, Is0...>, integer_sequence<T, Is1...>
->
+namespace detail
 {
-  using type = integer_sequence<T, (sizeof...(Is1) + Is0)..., Is1...>;
-};
-
-///////////////////////////////////////////////////////////////////////////////
 
 template <typename T, std::size_t N>
 struct make_reversed_integer_sequence_impl
@@ -237,7 +348,7 @@ struct make_reversed_integer_sequence_impl<T, 1>
 
 ///////////////////////////////////////////////////////////////////////////////
 
-template <typename T, T I0, T... Is> 
+template <typename T, T I0, T... Is>
 struct integer_sequence_push_front_impl<T, I0, integer_sequence<T, Is...> >
 {
   using type = integer_sequence<T, I0, Is...>;
@@ -245,7 +356,7 @@ struct integer_sequence_push_front_impl<T, I0, integer_sequence<T, Is...> >
 
 ///////////////////////////////////////////////////////////////////////////////
 
-template <typename T, T I0, T... Is> 
+template <typename T, T I0, T... Is>
 struct integer_sequence_push_back_impl<T, I0, integer_sequence<T, Is...> >
 {
   using type = integer_sequence<T, Is..., I0>;
@@ -255,7 +366,16 @@ struct integer_sequence_push_back_impl<T, I0, integer_sequence<T, Is...> >
 
 } // namespace detail
 
-THRUST_END_NS
+/*! \endcond
+ */
+
+/*! \} // type traits
+ */
+
+/*! \} // utility
+ */
+
+THRUST_NAMESPACE_END
 
 #endif // THRUST_CPP_DIALECT >= 2011
 
diff --git a/thrust/type_traits/is_contiguous_iterator.h b/thrust/type_traits/is_contiguous_iterator.h
index 9e704dc31..eaa088978 100644
--- a/thrust/type_traits/is_contiguous_iterator.h
+++ b/thrust/type_traits/is_contiguous_iterator.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2018 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,21 +14,24 @@
  *  limitations under the License.
  */
 
-/*! \file is_contiguous_iterator.h
- *  \brief An extensible type trait for determining if an iterator satisifies
- *         the <a href="https://en.cppreference.com/w/cpp/named_req/ContiguousIterator">ContiguousIterator</a>
- *         requirements (e.g. is pointer-like).
+/*! \file
+ *  \brief An extensible type trait for determining if an iterator satisifies the
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/ContiguousIterator">ContiguousIterator</a>
+ *  requirements (aka is pointer-like).
  */
 
 #pragma once
 
 #include <thrust/detail/config.h>
+#include <thrust/detail/raw_pointer_cast.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
 
 #include <iterator>
+#include <type_traits>
+#include <utility>
 
-#if defined(_MSC_VER) && _MSC_VER < 1916 // MSVC 2017 version 15.9
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC && _MSC_VER < 1916 // MSVC 2017 version 15.9
   #include <vector>
   #include <string>
   #include <array>
@@ -38,7 +41,18 @@
   #endif
 #endif
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup utility
+ *  \{
+ */
+
+/*! \addtogroup type_traits Type Traits
+ *  \{
+ */
+
+/*! \cond
+ */
 
 namespace detail
 {
@@ -48,10 +62,19 @@ struct is_contiguous_iterator_impl;
 
 } // namespace detail
 
-/// Unary metafunction returns \c true_type if \c Iterator satisfies
-/// <a href="https://en.cppreference.com/w/cpp/named_req/ContiguousIterator">ContiguousIterator</a>,
-/// e.g. it points to elements that are contiguous in memory, and \c false_type
-/// otherwise.
+/*! \endcond
+ */
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/named_req/UnaryTypeTrait"><i>UnaryTypeTrait</i></a>
+ *  that returns \c true_type if \c Iterator satisfies
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/ContiguousIterator">ContiguousIterator</a>,
+ *  aka it points to elements that are contiguous in memory, and \c false_type
+ *  otherwise.
+ *
+ * \see is_contiguous_iterator_v
+ * \see proclaim_contiguous_iterator
+ * \see THRUST_PROCLAIM_CONTIGUOUS_ITERATOR
+ */
 template <typename Iterator>
 #if THRUST_CPP_DIALECT >= 2011
 using is_contiguous_iterator =
@@ -65,32 +88,47 @@ struct is_contiguous_iterator :
 ;
 
 #if THRUST_CPP_DIALECT >= 2014
-/// <code>constexpr bool</code> that is \c true if \c Iterator satisfies
-/// <a href="https://en.cppreference.com/w/cpp/named_req/ContiguousIterator">ContiguousIterator</a>,
-/// e.g. it points to elements that are contiguous in memory, and \c false
-/// otherwise.
+/*! \brief <tt>constexpr bool</tt> that is \c true if \c Iterator satisfies
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/ContiguousIterator">ContiguousIterator</a>,
+ *  aka it points to elements that are contiguous in memory, and \c false
+ *  otherwise.
+ *
+ * \see is_contiguous_iterator
+ * \see proclaim_contiguous_iterator
+ * \see THRUST_PROCLAIM_CONTIGUOUS_ITERATOR
+ */
 template <typename Iterator>
 constexpr bool is_contiguous_iterator_v = is_contiguous_iterator<Iterator>::value;
 #endif
 
-/// Customization point that can be customized to indicate that an iterator
-/// type \c Iterator satisfies
-/// <a href="https://en.cppreference.com/w/cpp/named_req/ContiguousIterator">ContiguousIterator</a>,
-/// e.g. it points to elements that are contiguous in memory.
+/*! \brief Customization point that can be customized to indicate that an
+ *  iterator type \c Iterator satisfies
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/ContiguousIterator">ContiguousIterator</a>,
+ *  aka it points to elements that are contiguous in memory.
+ *
+ * \see is_contiguous_iterator
+ * \see THRUST_PROCLAIM_CONTIGUOUS_ITERATOR
+ */
 template <typename Iterator>
 struct proclaim_contiguous_iterator : false_type {};
 
-/// Declares that the iterator \c Iterator is
-/// <a href="https://en.cppreference.com/w/cpp/named_req/ContiguousIterator">ContiguousIterator</a>
-/// by specializing `thrust::proclaim_contiguous_iterator`.
+/*! \brief Declares that the iterator \c Iterator is
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/ContiguousIterator">ContiguousIterator</a>
+ *  by specializing \c proclaim_contiguous_iterator.
+ *
+ * \see is_contiguous_iterator
+ * \see proclaim_contiguous_iterator
+ */
 #define THRUST_PROCLAIM_CONTIGUOUS_ITERATOR(Iterator)                         \
-  THRUST_BEGIN_NS                                                             \
+  THRUST_NAMESPACE_BEGIN                                                      \
   template <>                                                                 \
-  struct proclaim_contiguous_iterator<Iterator> : ::thrust::true_type {};     \
-  THRUST_END_NS                                                               \
+  struct proclaim_contiguous_iterator<Iterator>                               \
+      : THRUST_NS_QUALIFIER::true_type {};                                    \
+  THRUST_NAMESPACE_END                                                        \
   /**/
 
-///////////////////////////////////////////////////////////////////////////////
+/*! \cond
+ */
 
 namespace detail
 {
@@ -164,7 +202,6 @@ template <typename Iterator>
 struct is_msvc_contiguous_iterator : false_type {};
 #endif
 
-
 template <typename Iterator>
 struct is_contiguous_iterator_impl
   : integral_constant<
@@ -178,7 +215,81 @@ struct is_contiguous_iterator_impl
     >
 {};
 
+// Type traits for contiguous iterators:
+template <typename Iterator>
+struct contiguous_iterator_traits
+{
+  static_assert(thrust::is_contiguous_iterator<Iterator>::value,
+                "contiguous_iterator_traits requires a contiguous iterator.");
+
+  using raw_pointer = typename thrust::detail::pointer_traits<
+    decltype(&*std::declval<Iterator>())>::raw_pointer;
+};
+
+template <typename Iterator>
+using contiguous_iterator_raw_pointer_t =
+  typename contiguous_iterator_traits<Iterator>::raw_pointer;
+
+// Converts a contiguous iterator to a raw pointer:
+template <typename Iterator>
+__host__ __device__
+contiguous_iterator_raw_pointer_t<Iterator>
+contiguous_iterator_raw_pointer_cast(Iterator it)
+{
+  static_assert(thrust::is_contiguous_iterator<Iterator>::value,
+                "contiguous_iterator_raw_pointer_cast called with "
+                "non-contiguous iterator.");
+  return thrust::raw_pointer_cast(&*it);
+}
+
+// Implementation for non-contiguous iterators -- passthrough.
+template <typename Iterator,
+          bool IsContiguous = thrust::is_contiguous_iterator<Iterator>::value>
+struct try_unwrap_contiguous_iterator_impl
+{
+  using type = Iterator;
+
+  static __host__ __device__ type get(Iterator it) { return it; }
+};
+
+// Implementation for contiguous iterators -- unwraps to raw pointer.
+template <typename Iterator>
+struct try_unwrap_contiguous_iterator_impl<Iterator, true /*is_contiguous*/>
+{
+  using type = contiguous_iterator_raw_pointer_t<Iterator>;
+
+  static __host__ __device__ type get(Iterator it)
+  {
+    return contiguous_iterator_raw_pointer_cast(it);
+  }
+};
+
+template <typename Iterator>
+using try_unwrap_contiguous_iterator_return_t =
+  typename try_unwrap_contiguous_iterator_impl<Iterator>::type;
+
+// Casts to a raw pointer if iterator is marked as contiguous, otherwise returns
+// the input iterator.
+template <typename Iterator>
+__host__ __device__
+try_unwrap_contiguous_iterator_return_t<Iterator>
+try_unwrap_contiguous_iterator(Iterator it)
+{
+  return try_unwrap_contiguous_iterator_impl<Iterator>::get(it);
+}
+
 } // namespace detail
 
-THRUST_END_NS
+/*! \endcond
+ */
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \} // type traits
+ */
+
+/*! \} // utility
+ */
+
+THRUST_NAMESPACE_END
 
diff --git a/thrust/type_traits/is_execution_policy.h b/thrust/type_traits/is_execution_policy.h
index 5412e6c44..f83751ea2 100644
--- a/thrust/type_traits/is_execution_policy.h
+++ b/thrust/type_traits/is_execution_policy.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2018 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,6 +14,10 @@
  *  limitations under the License.
  */
 
+/*! \file
+ *  \brief A type trait that determines if a type is an \a ExecutionPolicy.
+ */
+
 #pragma once
 
 #include <thrust/detail/config.h>
@@ -21,10 +25,20 @@
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/execution_policy.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup utility
+ *  \{
+ */
+
+/*! \addtogroup type_traits Type Traits
+ *  \{
+ */
 
-/// Unary metafunction that is \c true if \c T is an \a ExecutionPolicy and
-/// \c false otherwise.
+/*! \brief <a href="https://en.cppreference.com/w/cpp/named_req/UnaryTypeTrait"><i>UnaryTypeTrait</i></a>
+ *  that returns \c true_type if \c T is an \a ExecutionPolicy and \c false_type
+ *  otherwise.
+ */
 template <typename T>
 #if THRUST_CPP_DIALECT >= 2011
 using is_execution_policy =
@@ -37,13 +51,19 @@ struct is_execution_policy :
 #endif
 ;
 
-/// <CODE>constexpr bool</CODE> that is \c true if \c T is an \a ExecutionPolicy
-/// and \c false otherwise.
 #if THRUST_CPP_DIALECT >= 2014
+/*! \brief <tt>constexpr bool</tt> that is \c true if \c T is an
+ *  \a ExecutionPolicy and \c false otherwise.
+ */
 template <typename T>
 constexpr bool is_execution_policy_v = is_execution_policy<T>::value;
 #endif
 
-THRUST_END_NS
+/*! \} // type traits
+ */
+
+/*! \} // utility
+ */
 
+THRUST_NAMESPACE_END
 
diff --git a/thrust/type_traits/is_operator_less_or_greater_function_object.h b/thrust/type_traits/is_operator_less_or_greater_function_object.h
index 4fb53bda5..ef5a19f69 100644
--- a/thrust/type_traits/is_operator_less_or_greater_function_object.h
+++ b/thrust/type_traits/is_operator_less_or_greater_function_object.h
@@ -1,6 +1,5 @@
-
 /*
- *  Copyright 2008-2018 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -15,9 +14,10 @@
  *  limitations under the License.
  */
 
-/*! \file is_operator_less_or_greater_function_object.h
- *  \brief Type traits for determining if a \c BinaryFunction is equivalent to
-///        either \c operator< or \c operator>.
+/*! \file
+ *  \brief Type traits for determining if a
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">BinaryPredicate</a>
+ *  is equivalent to either \c operator< or \c operator>.
  */
 
 #pragma once
@@ -27,75 +27,127 @@
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup utility
+ *  \{
+ */
+
+/*! \addtogroup type_traits Type Traits
+ *  \{
+ */
+
+/*! \cond
+ */
 
 namespace detail
 {
 
-template <typename FunctionObject>
+template <typename T>
 struct is_operator_less_function_object_impl;
 
-template <typename FunctionObject>
+template <typename T>
 struct is_operator_greater_function_object_impl;
 
 } // namespace detail
 
-/// Unary metafunction returns \c true_type if \c FunctionObject is equivalent
-/// to \c operator<, and \c false_type otherwise.
-template <typename FunctionObject>
+/*! \endcond
+ */
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/named_req/UnaryTypeTrait"><i>UnaryTypeTrait</i></a>
+ *  that returns \c true_type if \c T is a
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">BinaryPredicate</a>
+ *  equivalent to \c operator<, and \c false_type otherwise.
+ *
+ *  \see is_operator_less_function_object_v
+ *  \see is_operator_greater_function_object
+ *  \see is_operator_less_or_greater_function_object
+ *  \see is_operator_plus_function_object
+ */
+template <typename T>
 #if THRUST_CPP_DIALECT >= 2011
 using is_operator_less_function_object =
 #else
 struct is_operator_less_function_object :
 #endif
-  detail::is_operator_less_function_object_impl<FunctionObject>
+  detail::is_operator_less_function_object_impl<T>
 #if THRUST_CPP_DIALECT < 2011
 {}
 #endif
 ;
 
 #if THRUST_CPP_DIALECT >= 2014
-/// <code>constexpr bool</code> that is \c true if \c FunctionObject is
-/// equivalent to \c operator<, and \c false otherwise.
-template <typename FunctionObject>
+/*! \brief <tt>constexpr bool</tt> that is \c true if \c T is a
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">BinaryPredicate</a>
+ *  equivalent to \c operator<, and \c false otherwise.
+ *
+ *  \see is_operator_less_function_object
+ *  \see is_operator_greater_function_object
+ *  \see is_operator_less_or_greater_function_object
+ *  \see is_operator_plus_function_object
+ */
+template <typename T>
 constexpr bool is_operator_less_function_object_v
-  = is_operator_less_function_object<FunctionObject>::value;
+  = is_operator_less_function_object<T>::value;
 #endif
 
-/// Unary metafunction returns \c true_type if \c FunctionObject is equivalent
-/// to \c operator>, and \c false_type otherwise.
-template <typename FunctionObject>
+/*! \brief <a href="https://en.cppreference.com/w/cpp/named_req/UnaryTypeTrait"><i>UnaryTypeTrait</i></a>
+ *  that returns \c true_type if \c T is a
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">BinaryPredicate</a>
+ *  equivalent to \c operator>, and \c false_type otherwise.
+ *
+ *  \see is_operator_greater_function_object_v
+ *  \see is_operator_less_function_object
+ *  \see is_operator_less_or_greater_function_object
+ *  \see is_operator_plus_function_object
+ */
+template <typename T>
 #if THRUST_CPP_DIALECT >= 2011
 using is_operator_greater_function_object =
 #else
 struct is_operator_greater_function_object :
 #endif
-  detail::is_operator_greater_function_object_impl<FunctionObject>
+  detail::is_operator_greater_function_object_impl<T>
 #if THRUST_CPP_DIALECT < 2011
 {}
 #endif
 ;
 
 #if THRUST_CPP_DIALECT >= 2014
-/// <code>constexpr bool</code> that is \c true if \c FunctionObject is
-/// equivalent to \c operator>, and \c false otherwise.
-template <typename FunctionObject>
+/*! \brief <tt>constexpr bool</tt> that is \c true if \c T is a
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">BinaryPredicate</a>
+ *  equivalent to \c operator>, and \c false otherwise.
+ *
+ *  \see is_operator_greater_function_object
+ *  \see is_operator_less_function_object
+ *  \see is_operator_less_or_greater_function_object
+ *  \see is_operator_plus_function_object
+ */
+template <typename T>
 constexpr bool is_operator_greater_function_object_v
-  = is_operator_greater_function_object<FunctionObject>::value;
+  = is_operator_greater_function_object<T>::value;
 #endif
 
-/// Unary metafunction returns \c true_type if \c FunctionObject is equivalent
-/// to either \c operator<, and \c false_type otherwise.
-template <typename FunctionObject>
+/*! \brief <a href="https://en.cppreference.com/w/cpp/named_req/UnaryTypeTrait"><i>UnaryTypeTrait</i></a>
+ *  that returns \c true_type if \c T is a
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">BinaryPredicate</a>
+ *  equivalent to \c operator< or \c operator>, and \c false_type otherwise.
+ *
+ *  \see is_operator_less_or_greater_function_object_v
+ *  \see is_operator_less_function_object
+ *  \see is_operator_greater_function_object
+ *  \see is_operator_plus_function_object
+ */
+template <typename T>
 #if THRUST_CPP_DIALECT >= 2011
 using is_operator_less_or_greater_function_object =
 #else
 struct is_operator_less_or_greater_function_object :
 #endif
   integral_constant<
-    bool 
-  ,    detail::is_operator_less_function_object_impl<FunctionObject>::value
-    || detail::is_operator_greater_function_object_impl<FunctionObject>::value
+    bool
+  ,    detail::is_operator_less_function_object_impl<T>::value
+    || detail::is_operator_greater_function_object_impl<T>::value
   >
 #if THRUST_CPP_DIALECT < 2011
 {}
@@ -103,26 +155,36 @@ struct is_operator_less_or_greater_function_object :
 ;
 
 #if THRUST_CPP_DIALECT >= 2014
-/// <code>constexpr bool</code> that is \c true if \c FunctionObject is
-/// equivalent to either \c operator< or \c operator>, and \c false otherwise.
-template <typename FunctionObject>
+/*! \brief <tt>constexpr bool</tt> that is \c true if \c T is a
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">BinaryPredicate</a>
+ *  equivalent to \c operator< or \c operator>, and \c false otherwise.
+ *
+ *  \see is_operator_less_or_greater_function_object
+ *  \see is_operator_less_function_object
+ *  \see is_operator_greater_function_object
+ *  \see is_operator_plus_function_object
+ */
+template <typename T>
 constexpr bool is_operator_less_or_greater_function_object_v
-  = is_operator_less_or_greater_function_object<FunctionObject>::value;
+  = is_operator_less_or_greater_function_object<T>::value;
 #endif
 
 ///////////////////////////////////////////////////////////////////////////////
 
+/*! \cond
+ */
+
 namespace detail
 {
 
-template <typename FunctionObject>
+template <typename T>
 struct is_operator_less_function_object_impl                   : false_type {};
 template <typename T>
 struct is_operator_less_function_object_impl<thrust::less<T> > : true_type {};
 template <typename T>
 struct is_operator_less_function_object_impl<std::less<T>    > : true_type {};
 
-template <typename FunctionObject>
+template <typename T>
 struct is_operator_greater_function_object_impl                      : false_type {};
 template <typename T>
 struct is_operator_greater_function_object_impl<thrust::greater<T> > : true_type {};
@@ -131,5 +193,16 @@ struct is_operator_greater_function_object_impl<std::greater<T>    > : true_type
 
 } // namespace detail
 
-THRUST_END_NS
+/*! \endcond
+ */
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \} // type traits
+ */
+
+/*! \} // utility
+ */
+
+THRUST_NAMESPACE_END
 
diff --git a/thrust/type_traits/is_operator_plus_function_object.h b/thrust/type_traits/is_operator_plus_function_object.h
index 80481dfb0..800847532 100644
--- a/thrust/type_traits/is_operator_plus_function_object.h
+++ b/thrust/type_traits/is_operator_plus_function_object.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2018 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,9 +14,10 @@
  *  limitations under the License.
  */
 
-/*! \file is_operator_plus_function_object.h
- *  \brief Type traits for determining if a \c BinaryFunction is equivalent to
-///        \c operator+.
+/*! \file
+ *  \brief Type traits for determining if a
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/FunctionObject">FunctionObject</a>
+ *  is equivalent to \c operator+.
  */
 
 #pragma once
@@ -26,44 +27,76 @@
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup utility
+ *  \{
+ */
+
+/*! \addtogroup type_traits Type Traits
+ *  \{
+ */
+
+/*! \cond
+ */
 
 namespace detail
 {
 
-template <typename FunctionObject>
+template <typename T>
 struct is_operator_plus_function_object_impl;
 
 } // namespace detail
 
-/// Unary metafunction returns \c true_type if \c FunctionObject is equivalent
-/// to \c operator<, and \c false_type otherwise.
-template <typename FunctionObject>
+/*! \endcond
+ */
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/named_req/UnaryTypeTrait"><i>UnaryTypeTrait</i></a>
+ *  that returns \c true_type if \c T is a
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/FunctionObject">FunctionObject</a>
+ *  equivalent to \c operator+, and \c false_type otherwise.
+ *
+ *  \see is_operator_plus_function_object_v
+ *  \see is_operator_less_function_object
+ *  \see is_operator_greater_function_object
+ *  \see is_operator_less_or_greater_function_object
+ */
+template <typename T>
 #if THRUST_CPP_DIALECT >= 2011
 using is_operator_plus_function_object =
 #else
 struct is_operator_plus_function_object :
 #endif
-  detail::is_operator_plus_function_object_impl<FunctionObject>
+  detail::is_operator_plus_function_object_impl<T>
 #if THRUST_CPP_DIALECT < 2011
 {}
 #endif
 ;
 
 #if THRUST_CPP_DIALECT >= 2014
-/// <code>constexpr bool</code> that is \c true if \c FunctionObject is
-/// equivalent to \c operator<, and \c false otherwise.
-template <typename FunctionObject>
+/*! \brief <tt>constexpr bool</tt> that is \c true if \c T is a
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/FunctionObject">FunctionObject</a>
+ *  equivalent to \c operator<, and \c false otherwise.
+ *
+ *  \see is_operator_plus_function_object
+ *  \see is_operator_less_function_object
+ *  \see is_operator_greater_function_object
+ *  \see is_operator_less_or_greater_function_object
+ */
+template <typename T>
 constexpr bool is_operator_plus_function_object_v
-  = is_operator_plus_function_object<FunctionObject>::value;
+  = is_operator_plus_function_object<T>::value;
 #endif
 
 ///////////////////////////////////////////////////////////////////////////////
 
+/*! \cond
+ */
+
 namespace detail
 {
 
-template <typename FunctionObject>
+template <typename T>
 struct is_operator_plus_function_object_impl                   : false_type {};
 template <typename T>
 struct is_operator_plus_function_object_impl<thrust::plus<T> > : true_type {};
@@ -72,5 +105,14 @@ struct is_operator_plus_function_object_impl<std::plus<T>    > : true_type {};
 
 } // namespace detail
 
-THRUST_END_NS
+/*! \endcond
+ */
+
+/*! \} // type traits
+ */
+
+/*! \} // utility
+ */
+
+THRUST_NAMESPACE_END
 
diff --git a/thrust/type_traits/is_trivially_relocatable.h b/thrust/type_traits/is_trivially_relocatable.h
index 00c614d3b..21d1f09d8 100644
--- a/thrust/type_traits/is_trivially_relocatable.h
+++ b/thrust/type_traits/is_trivially_relocatable.h
@@ -1,14 +1,24 @@
-///////////////////////////////////////////////////////////////////////////////
-//  Copyright (c)      2018 NVIDIA Corporation
-//
-//  Distributed under the Boost Software License, Version 1.0. (See accompanying
-//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
-///////////////////////////////////////////////////////////////////////////////
+/*
+ *  Copyright 2008-2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
 
-/*! \file is_trivially_relocatable.h
- *  \brief <a href="https://wg21.link/P1144R0">P1144R0</a>'s
- *         \c is_trivially_relocatable, an extensible type trait indicating
- *         whether a type can be bitwise copied (e.g. via \c memcpy).
+/*! \file
+ *  \brief <a href="https://wg21.link/P1144">P1144</a>'s proposed
+ *  \c std::is_trivially_relocatable, an extensible type trait indicating
+ *  whether a type can be bitwise copied with a facility like
+ *  <a href="https://en.cppreference.com/w/cpp/string/byte/memcpy"><tt>std::memcpy</tt></a>.
  */
 
 #pragma once
@@ -22,7 +32,18 @@
   #include <type_traits>
 #endif
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup utility
+ *  \{
+ */
+
+/*! \addtogroup type_traits Type Traits
+ *  \{
+ */
+
+/*! \cond
+ */
 
 namespace detail
 {
@@ -32,9 +53,22 @@ struct is_trivially_relocatable_impl;
 
 } // namespace detail
 
-/// Unary metafunction returns \c true_type if \c T is \a TriviallyRelocatable, 
-/// e.g. can be bitwise copied (with a facility like \c memcpy), and
-/// \c false_type otherwise.
+/*! \endcond
+ */
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/named_req/UnaryTypeTrait"><i>UnaryTypeTrait</i></a>
+ *  that returns \c true_type if \c T is
+ *  <a href="https://wg21.link/P1144"><i>TriviallyRelocatable</i></a>,
+ *  aka can be bitwise copied with a facility like
+ *  <a href="https://en.cppreference.com/w/cpp/string/byte/memcpy"><tt>std::memcpy</tt></a>,
+ *  and \c false_type otherwise.
+ *
+ * \see is_trivially_relocatable_v
+ * \see is_trivially_relocatable_to
+ * \see is_indirectly_trivially_relocatable_to
+ * \see proclaim_trivially_relocatable
+ * \see THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE
+ */
 template <typename T>
 #if THRUST_CPP_DIALECT >= 2011
 using is_trivially_relocatable =
@@ -48,16 +82,35 @@ struct is_trivially_relocatable :
 ;
 
 #if THRUST_CPP_DIALECT >= 2014
-/// <code>constexpr bool</code> that is \c true if \c T is
-/// \a TriviallyRelocatable e.g. can be copied bitwise (with a facility like
-/// \c memcpy), and \c false otherwise.
+/*! \brief <tt>constexpr bool</tt> that is \c true if \c T is
+ *  <a href="https://wg21.link/P1144"><i>TriviallyRelocatable</i></a>,
+ *  aka can be bitwise copied with a facility like
+ *  <a href="https://en.cppreference.com/w/cpp/string/byte/memcpy"><tt>std::memcpy</tt></a>,
+ *  and \c false otherwise.
+ *
+ * \see is_trivially_relocatable
+ * \see is_trivially_relocatable_to
+ * \see is_indirectly_trivially_relocatable_to
+ * \see proclaim_trivially_relocatable
+ * \see THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE
+ */
 template <typename T>
 constexpr bool is_trivially_relocatable_v = is_trivially_relocatable<T>::value;
 #endif
 
-/// Unary metafunction returns \c true_type if \c From is \a TriviallyRelocatable
-/// to \c To, e.g. can be bitwise copied (with a facility like \c memcpy), and
-/// \c false_type otherwise.
+/*! \brief <a href="https://en.cppreference.com/w/cpp/named_req/BinaryTypeTrait"><i>BinaryTypeTrait</i></a>
+ *  that returns \c true_type if \c From is
+ *  <a href="https://wg21.link/P1144"><i>TriviallyRelocatable</i></a>,
+ *  to \c To, aka can be bitwise copied with a facility like
+ *  <a href="https://en.cppreference.com/w/cpp/string/byte/memcpy"><tt>std::memcpy</tt></a>,
+ *  and \c false_type otherwise.
+ *
+ * \see is_trivially_relocatable_to_v
+ * \see is_trivially_relocatable
+ * \see is_indirectly_trivially_relocatable_to
+ * \see proclaim_trivially_relocatable
+ * \see THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE
+ */
 template <typename From, typename To>
 #if THRUST_CPP_DIALECT >= 2011
 using is_trivially_relocatable_to =
@@ -74,17 +127,37 @@ struct is_trivially_relocatable_to :
 ;
 
 #if THRUST_CPP_DIALECT >= 2014
-/// <code>constexpr bool</code> that is \c true if \c From is 
-/// \a TriviallyRelocatable to \c To, e.g. can be copied bitwise (with a
-/// facility like \c memcpy), and \c false otherwise.
+/*! \brief <tt>constexpr bool</tt> that is \c true if \c From is
+ *  <a href="https://wg21.link/P1144"><i>TriviallyRelocatable</i></a>,
+ *  to \c To, aka can be bitwise copied with a facility like
+ *  <a href="https://en.cppreference.com/w/cpp/string/byte/memcpy"><tt>std::memcpy</tt></a>,
+ *  and \c false otherwise.
+ *
+ * \see is_trivially_relocatable_to
+ * \see is_trivially_relocatable
+ * \see is_indirectly_trivially_relocatable_to
+ * \see proclaim_trivially_relocatable
+ * \see THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE
+ */
 template <typename From, typename To>
 constexpr bool is_trivially_relocatable_to_v
   = is_trivially_relocatable_to<From, To>::value;
 #endif
 
-/// Unary metafunction that returns \c true_type if the element type of
-/// \c FromIterator is \a TriviallyRelocatable to the element type of
-/// \c ToIterator, and \c false_type otherwise.
+/*! \brief <a href="https://en.cppreference.com/w/cpp/named_req/BinaryTypeTrait"><i>BinaryTypeTrait</i></a>
+ *  that returns \c true_type if the element type of \c FromIterator is
+ *  <a href="https://wg21.link/P1144"><i>TriviallyRelocatable</i></a>,
+ *  to the element type of \c ToIterator, aka can be bitwise copied with a
+ *  facility like
+ *  <a href="https://en.cppreference.com/w/cpp/string/byte/memcpy"><tt>std::memcpy</tt></a>,
+ *  and \c false_type otherwise.
+ *
+ * \see is_indirectly_trivially_relocatable_to_v
+ * \see is_trivially_relocatable
+ * \see is_trivially_relocatable_to
+ * \see proclaim_trivially_relocatable
+ * \see THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE
+ */
 template <typename FromIterator, typename ToIterator>
 #if THRUST_CPP_DIALECT >= 2011
 using is_indirectly_trivially_relocatable_to =
@@ -106,31 +179,63 @@ struct is_indirectly_trivially_relocatable_to :
 ;
 
 #if THRUST_CPP_DIALECT >= 2014
-/// <code>constexpr bool</code> that is \c true if the element type of
-/// \c FromIterator is \a TriviallyRelocatable to the element type of
-/// \c ToIterator, and \c false otherwise.
+/*! \brief <tt>constexpr bool</tt> that is \c true if the element type of
+ *  \c FromIterator is
+ *  <a href="https://wg21.link/P1144"><i>TriviallyRelocatable</i></a>,
+ *  to the element type of \c ToIterator, aka can be bitwise copied with a
+ *  facility like
+ *  <a href="https://en.cppreference.com/w/cpp/string/byte/memcpy"><tt>std::memcpy</tt></a>,
+ *  and \c false otherwise.
+ *
+ * \see is_indirectly_trivially_relocatable_to
+ * \see is_trivially_relocatable
+ * \see is_trivially_relocatable_to
+ * \see proclaim_trivially_relocatable
+ * \see THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE
+ */
 template <typename FromIterator, typename ToIterator>
-constexpr bool is_trivial_relocatable_sequence_copy_v
+constexpr bool is_indirectly_trivially_relocate_to_v
   = is_indirectly_trivially_relocatable_to<FromIterator, ToIterator>::value;
 #endif
 
-/// Customization point that can be customized to indicate that a type \c T is
-/// \a TriviallyRelocatable, e.g. can be copied bitwise (with a facility like
-/// \c memcpy).
+/*! \brief <a href="http://eel.is/c++draft/namespace.std#def:customization_point"><i>customization point</i></a>
+ *  that can be specialized customized to indicate that a type \c T is
+ *  <a href="https://wg21.link/P1144"><i>TriviallyRelocatable</i></a>,
+ *  aka it can be bitwise copied with a facility like
+ *  <a href="https://en.cppreference.com/w/cpp/string/byte/memcpy"><tt>std::memcpy</tt></a>.
+ *
+ * \see is_indirectly_trivially_relocatable_to
+ * \see is_trivially_relocatable
+ * \see is_trivially_relocatable_to
+ * \see THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE
+ */
 template <typename T>
 struct proclaim_trivially_relocatable : false_type {};
 
-/// Declares that the type \c T is \a TriviallyRelocatable by specializing
-/// `thrust::proclaim_trivially_relocatable`.
+/*! \brief Declares that the type \c T is
+ *  <a href="https://wg21.link/P1144"><i>TriviallyRelocatable</i></a>,
+ *  aka it can be bitwise copied with a facility like
+ *  <a href="https://en.cppreference.com/w/cpp/string/byte/memcpy"><tt>std::memcpy</tt></a>,
+ *  by specializing \c proclaim_trivially_relocatable.
+ *
+ * \see is_indirectly_trivially_relocatable_to
+ * \see is_trivially_relocatable
+ * \see is_trivially_relocatable_to
+ * \see proclaim_trivially_relocatable
+ */
 #define THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(T)                              \
-  THRUST_BEGIN_NS                                                             \
+  THRUST_NAMESPACE_BEGIN                                                      \
   template <>                                                                 \
-  struct proclaim_trivially_relocatable<T> : ::thrust::true_type {};          \
-  THRUST_END_NS                                                               \
+  struct proclaim_trivially_relocatable<T> : THRUST_NS_QUALIFIER::true_type   \
+  {};                                                                         \
+  THRUST_NAMESPACE_END                                                        \
   /**/
 
 ///////////////////////////////////////////////////////////////////////////////
 
+/*! \cond
+ */
+
 namespace detail
 {
 
@@ -185,7 +290,7 @@ struct is_trivially_relocatable_impl<T[N]> : is_trivially_relocatable_impl<T> {}
 
 } // namespace detail
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
 
@@ -248,3 +353,14 @@ THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(double3)
 THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(double4)
 #endif
 
+/*! \endcond
+ */
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \} // type traits
+ */
+
+/*! \} // utility
+ */
+
diff --git a/thrust/type_traits/logical_metafunctions.h b/thrust/type_traits/logical_metafunctions.h
index dbcc18382..914b477e8 100644
--- a/thrust/type_traits/logical_metafunctions.h
+++ b/thrust/type_traits/logical_metafunctions.h
@@ -1,13 +1,25 @@
-///////////////////////////////////////////////////////////////////////////////
-//  Copyright (c)      2018 NVIDIA Corporation
-//  Copyright (c) 2015-2018 Bryce Adelstein Lelbach aka wash
-//
-//  Distributed under the Boost Software License, Version 1.0. (See accompanying
-//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
-///////////////////////////////////////////////////////////////////////////////
+/*
+ *  Copyright 2008-2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
 
-/*! \file logical_metafunctions.h
- *  \brief C++17's \c conjunction, \c disjunction, and \c negation metafunctions.
+/*! \file
+ *  \brief C++17's
+ *  <a href="https://en.cppreference.com/w/cpp/types/conjunction"><tt>std::conjunction</tt></a>,
+ *  <a href="https://en.cppreference.com/w/cpp/types/disjunction"><tt>std::disjunction</tt></a>,
+ *  and <a href="https://en.cppreference.com/w/cpp/types/negation"><tt>std::negation</tt></a>
+ *  metafunctions and related extensions.
  */
 
 #pragma once
@@ -19,47 +31,32 @@
 
 #include <type_traits>
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
 
-#if THRUST_CPP_DIALECT >= 2017
-
-/// An \c integral_constant whose value is <code>(... && Ts::value)</code>. 
-template <typename... Ts>
-using conjunction = std::conjunction<Ts...>;
-
-/// A <code>constexpr bool</code> whose value is <code>(... && Ts::value)</code>.
-template <typename... Ts>
-constexpr bool conjunction_v = conjunction<Ts...>::value;
+/*! \addtogroup utility
+ *  \{
+ */
 
-/// An \c integral_constant whose value is <code>(... || Ts::value)</code>. 
-template <typename... Ts>
-using disjunction = std::disjunction<Ts...>;
+/*! \addtogroup type_traits Type Traits
+ *  \{
+ */
 
-/// A <code>constexpr bool</code> whose value is <code>(... || Ts::value)</code>.
+/*! \brief <a href="https://en.cppreference.com/w/cpp/types/integral_constant"><tt>std::integral_constant</tt></a>
+ *  whose value is <tt>(... && Ts::value)</tt>.
+ *
+ *  \see conjunction_v
+ *  \see conjunction_value
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/conjunction"><tt>std::conjunction</tt></a>
+ */
+#if THRUST_CPP_DIALECT >= 2017
 template <typename... Ts>
-constexpr bool disjunction_v = disjunction<Ts...>::value;
-
-/// An \c integral_constant whose value is <code>!Ts::value</code>. 
-template <typename T>
-using negation = std::negation<T>;
-
-/// A <code>constexpr bool</code> whose value is <code>!Ts::value</code>.
-template <typename T>
-constexpr bool negation_v = negation<T>::value;
-
-///////////////////////////////////////////////////////////////////////////////
-
+using conjunction = std::conjunction<Ts...>;
 #else // Older than C++17.
-
-/// An \c integral_constant whose value is <code>(... && Ts::value)</code>. 
 template <typename... Ts>
 struct conjunction;
 
-#if THRUST_CPP_DIALECT >= 2014
-/// A <code>constexpr bool</code> whose value is <code>(... && Ts::value)</code>.
-template <typename... Ts>
-constexpr bool conjunction_v = conjunction<Ts...>::value;
-#endif
+/*! \cond
+ */
 
 template <>
 struct conjunction<> : std::true_type {};
@@ -74,18 +71,38 @@ template<typename T0, typename T1, typename T2, typename... TN>
 struct conjunction<T0, T1, T2, TN...>
   : std::conditional<T0::value, conjunction<T1, T2, TN...>, T0>::type {};
 
-///////////////////////////////////////////////////////////////////////////////
-
-/// An \c integral_constant whose value is <code>(... || Ts::value)</code>. 
-template <typename... Ts>
-struct disjunction;
+/*! \endcond
+ */
+#endif
 
+/*! \brief <tt>constexpr bool</tt> whose value is <tt>(... && Ts::value)</tt>.
+ *
+ *  \see conjunction
+ *  \see conjunction_value
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/conjunction"><tt>std::conjunction</tt></a>
+ */
 #if THRUST_CPP_DIALECT >= 2014
-/// A <code>constexpr bool</code> whose value is <code>(... || Ts::value)</code>.
 template <typename... Ts>
-constexpr bool disjunction_v = disjunction<Ts...>::value;
+constexpr bool conjunction_v = conjunction<Ts...>::value;
 #endif
 
+/*! \brief <a href="https://en.cppreference.com/w/cpp/types/integral_constant"><tt>std::integral_constant</tt></a>
+ *  whose value is <tt>(... || Ts::value)</tt>.
+ *
+ *  \see disjunction_v
+ *  \see disjunction_value
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/disjunction"><tt>std::disjunction</tt></a>
+ */
+#if THRUST_CPP_DIALECT >= 2017
+template <typename... Ts>
+using disjunction = std::disjunction<Ts...>;
+#else // Older than C++17.
+template <typename... Ts>
+struct disjunction;
+
+/*! \cond
+ */
+
 template <>
 struct disjunction<> : std::false_type {};
 
@@ -96,83 +113,175 @@ template <typename T0, typename... TN>
 struct disjunction<T0, TN...>
   : std::conditional<T0::value != false, T0, disjunction<TN...> >::type {};
 
-///////////////////////////////////////////////////////////////////////////////
+/*! \endcond
+ */
+#endif
 
-/// An \c integral_constant whose value is <code>!T::value</code>. 
+/*! \brief <tt>constexpr bool</tt> whose value is <tt>(... || Ts::value)</tt>.
+ *
+ *  \see disjunction
+ *  \see disjunction_value
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/disjunction"><tt>std::disjunction</tt></a>
+ */
+#if THRUST_CPP_DIALECT >= 2014
+template <typename... Ts>
+constexpr bool disjunction_v = disjunction<Ts...>::value;
+#endif
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/types/integral_constant"><tt>std::integral_constant</tt></a>
+ *  whose value is <tt>!Ts::value</tt>.
+ *
+ *  \see negation_v
+ *  \see negation_value
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/negation"><tt>std::negation</tt></a>
+ */
+#if THRUST_CPP_DIALECT >= 2017
+template <typename T>
+using negation = std::negation<T>;
+#else // Older than C++17.
 template <typename T>
 struct negation;
 
-#if THRUST_CPP_DIALECT >= 2014
-/// A <code>constexpr bool</code> whose value is <code>!T::value</code>.
-template <typename T>
-constexpr bool negation_v = negation<T>::value;
-#endif
+/*! \cond
+ */
 
 template <typename T>
 struct negation : std::integral_constant<bool, !T::value> {};
 
-#endif // THRUST_CPP_DIALECT >= 2017
+/*! \endcond
+ */
+#endif
+
+/*! \brief <tt>constexpr bool</tt> whose value is <tt>!Ts::value</tt>.
+ *
+ *  \see negation
+ *  \see negation_value
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/negation"><tt>std::negation</tt></a>
+ */
+#if THRUST_CPP_DIALECT >= 2014
+template <typename T>
+constexpr bool negation_v = negation<T>::value;
+#endif
 
 ///////////////////////////////////////////////////////////////////////////////
 
-/// An \c integral_constant whose value is <code>(... && Bs)</code>. 
+/*! \brief <a href="https://en.cppreference.com/w/cpp/types/integral_constant"><tt>std::integral_constant</tt></a>
+ *  whose value is <tt>(... && Bs)</tt>.
+ *
+ *  \see conjunction_value_v
+ *  \see conjunction
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/conjunction"><tt>std::conjunction</tt></a>
+ */
 template <bool... Bs>
 struct conjunction_value;
 
 #if THRUST_CPP_DIALECT >= 2014
-/// A <code>constexpr bool</code> whose value is <code>(... && Bs)</code>.
+/*! \brief <tt>constexpr bool</tt> whose value is <tt>(... && Bs)</tt>.
+ *
+ *  \see conjunction_value
+ *  \see conjunction
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/conjunction"><tt>std::conjunction</tt></a>
+ */
 template <bool... Bs>
 constexpr bool conjunction_value_v = conjunction_value<Bs...>::value;
 #endif
 
+/*! \cond
+ */
+
 template <>
 struct conjunction_value<> : std::true_type {};
 
 template <bool B>
 struct conjunction_value<B> : std::integral_constant<bool, B> {};
 
-template <bool B0, bool... BN>
-struct conjunction_value<B0, BN...>
-  : std::integral_constant<bool, B0 && conjunction_value<BN...>::value> {};
+template <bool B, bool... Bs>
+struct conjunction_value<B, Bs...>
+  : std::integral_constant<bool, B && conjunction_value<Bs...>::value> {};
+
+/*! \endcond
+ */
 
 ///////////////////////////////////////////////////////////////////////////////
 
-/// An \c integral_constant whose value is <code>(... || Bs)</code>. 
+/*! \brief <a href="https://en.cppreference.com/w/cpp/types/integral_constant"><tt>std::integral_constant</tt></a>
+ *  whose value is <tt>(... || Bs)</tt>.
+ *
+ *  \see disjunction_value_v
+ *  \see disjunction
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/disjunction"><tt>std::disjunction</tt></a>
+ */
 template <bool... Bs>
 struct disjunction_value;
 
 #if THRUST_CPP_DIALECT >= 2014
-/// A <code>constexpr bool</code> whose value is <code>(... || Bs)</code>.
+/*! \brief <tt>constexpr bool</tt> whose value is <tt>(... || Bs)</tt>.
+ *
+ *  \see disjunction_value
+ *  \see disjunction
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/disjunction"><tt>std::disjunction</tt></a>
+ */
 template <bool... Bs>
 constexpr bool disjunction_value_v = disjunction_value<Bs...>::value;
 #endif
 
+/*! \cond
+ */
+
 template <>
 struct disjunction_value<> : std::false_type {};
 
 template <bool B>
 struct disjunction_value<B> : std::integral_constant<bool, B> {};
 
-template <bool B0, bool... BN>
-struct disjunction_value<B0, BN...>
-  : std::integral_constant<bool, B0 || disjunction_value<BN...>::value> {};
+template <bool B, bool... Bs>
+struct disjunction_value<B, Bs...>
+  : std::integral_constant<bool, B || disjunction_value<Bs...>::value> {};
+
+/*! \endcond
+ */
 
 ///////////////////////////////////////////////////////////////////////////////
 
-/// An \c integral_constant whose value is <code>!B</code>. 
+/*! \brief <a href="https://en.cppreference.com/w/cpp/types/integral_constant"><tt>std::integral_constant</tt></a>
+ *  whose value is <tt>!Bs</tt>.
+ *
+ *  \see negation_value_v
+ *  \see negation
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/negation"><tt>std::negation</tt></a>
+ */
 template <bool B>
 struct negation_value;
 
 #if THRUST_CPP_DIALECT >= 2014
-/// A <code>constexpr bool</code> whose value is <code>!B</code>.
+/*! \brief <tt>constexpr bool</tt> whose value is <tt>!Ts::value</tt>.
+ *
+ *  \see negation_value
+ *  \see negation
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/negation"><tt>std::negation</tt></a>
+ */
 template <bool B>
 constexpr bool negation_value_v = negation_value<B>::value;
 #endif
 
+/*! \cond
+ */
+
 template <bool B>
 struct negation_value : std::integral_constant<bool, !B> {};
 
-THRUST_END_NS
+/*! \endcond
+ */
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \} // type traits
+ */
+
+/*! \} // utility
+ */
+
+THRUST_NAMESPACE_END
 
 #endif // THRUST_CPP_DIALECT >= 2011
 
diff --git a/thrust/type_traits/remove_cvref.h b/thrust/type_traits/remove_cvref.h
index ef7304478..1da2e0de3 100644
--- a/thrust/type_traits/remove_cvref.h
+++ b/thrust/type_traits/remove_cvref.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2018 NVIDIA Corporation
+ *  Copyright 2018-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,34 +14,84 @@
  *  limitations under the License.
  */
 
+/*! \file
+ *  \brief C++20's
+ *  <a href="https://en.cppreference.com/w/cpp/types/remove_cvref">std::remove_cvref</a>.
+ */
+
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/type_traits.h>
 
-THRUST_BEGIN_NS
+#if  THRUST_CPP_DIALECT >= 2017
+#if __has_include(<version>)
+#  include <version>
+#endif
+#endif
 
-#if THRUST_CPP_DIALECT >= 2020
+#include <type_traits>
 
-using std::remove_cvref;
-using std::remove_cvref_t;
+THRUST_NAMESPACE_BEGIN
 
-#else // Older than C++20.
+/*! \addtogroup utility
+ *  \{
+ */
 
+/*! \addtogroup type_traits Type Traits
+ *  \{
+ */
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/named_req/UnaryTypeTrait"><i>UnaryTypeTrait</i></a>
+ *  that removes
+ *  <a href="https://en.cppreference.com/w/cpp/language/cv">const-volatile qualifiers</a>
+ *  and
+ *  <a href="https://en.cppreference.com/w/cpp/language/reference">references</a>
+ *  from \c T.
+ *  Equivalent to \c remove_cv_t<remove_reference_t<T>>.
+ *
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/remove_cvref">std::remove_cvref</a>
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/remove_cv">std::remove_cv</a>
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/remove_cv">std::remove_const</a>
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/remove_cv">std::remove_volatile</a>
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/remove_cv">std::remove_reference</a>
+ */
+#if defined(__cpp_lib_remove_cvref) && (__cpp_lib_remove_cvref >= 201711L)
+using std::remove_cvref;
+#else // Older than C++20.
 template <typename T>
 struct remove_cvref
 {
-  typedef typename detail::remove_cv<
-    typename detail::remove_reference<T>::type
-  >::type type;
+  using type = typename std::remove_cv<
+    typename std::remove_reference<T>::type
+  >::type;
 };
+#endif
 
-#if THRUST_CPP_DIALECT >= 2011
+/*! \brief Type alias that removes
+ *  <a href="https://en.cppreference.com/w/cpp/language/cv">const-volatile qualifiers</a>
+ *  and
+ *  <a href="https://en.cppreference.com/w/cpp/language/reference">references</a>
+ *  from \c T.
+ *  Equivalent to \c remove_cv_t<remove_reference_t<T>>.
+ *
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/remove_cvref">std::remove_cvref</a>
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/remove_cv">std::remove_cv</a>
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/remove_cv">std::remove_const</a>
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/remove_cv">std::remove_volatile</a>
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/remove_cv">std::remove_reference</a>
+ */
+#if defined(__cpp_lib_remove_cvref) && (__cpp_lib_remove_cvref >= 201711L)
+using std::remove_cvref_t;
+#else // Older than C++20.
 template <typename T>
 using remove_cvref_t = typename remove_cvref<T>::type;
 #endif
 
-#endif // THRUST_CPP_DIALECT >= 2020
+/*! \} // type traits
+ */
+
+/*! \} // utility
+ */
 
-THRUST_END_NS
+THRUST_NAMESPACE_END
 
diff --git a/thrust/type_traits/void_t.h b/thrust/type_traits/void_t.h
index 8550cc15b..ed12d861d 100644
--- a/thrust/type_traits/void_t.h
+++ b/thrust/type_traits/void_t.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2018 NVIDIA Corporation
+ *  Copyright 2018-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,8 +14,8 @@
  *  limitations under the License.
  */
 
-/*! \file void_t.h
- *  \brief C++17's `void_t`. 
+/*! \file
+ *  \brief C++17's `void_t`.
  */
 
 #pragma once
@@ -26,7 +26,15 @@
 #  include <type_traits>
 #endif
 
-THRUST_BEGIN_NS
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup utility
+ *  \{
+ */
+
+/*! \addtogroup type_traits Type Traits
+ *  \{
+ */
 
 #if THRUST_CPP_DIALECT >= 2011
 
@@ -59,5 +67,11 @@ struct voider
 
 #endif
 
-THRUST_END_NS
+/*! \} // type traits
+ */
+
+/*! \} // utility
+ */
+
+THRUST_NAMESPACE_END
 
diff --git a/thrust/uninitialized_copy.h b/thrust/uninitialized_copy.h
index af0f641a7..94c2763e3 100644
--- a/thrust/uninitialized_copy.h
+++ b/thrust/uninitialized_copy.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup copying
  *  \{
@@ -52,8 +50,8 @@ namespace thrust
  *  \return An iterator pointing to the last element of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that takes
  *          a single argument whose type is \p InputIterator's \c value_type.
  *
@@ -87,7 +85,7 @@ namespace thrust
  *  // x.val == 46 for all 0 <= i < N
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/uninitialized_copy.html
+ *  \see https://en.cppreference.com/w/cpp/memory/uninitialized_copy
  *  \see \c copy
  *  \see \c uninitialized_fill
  *  \see \c device_new
@@ -116,8 +114,8 @@ __host__ __device__
  *  \param result The first element of the output range to copy to.
  *  \return An iterator pointing to the last element of the output range.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that takes
  *          a single argument whose type is \p InputIterator's \c value_type.
  *
@@ -149,7 +147,7 @@ __host__ __device__
  *  // x.val == 46 for all 0 <= i < N
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/uninitialized_copy.html
+ *  \see https://en.cppreference.com/w/cpp/memory/uninitialized_copy
  *  \see \c copy
  *  \see \c uninitialized_fill
  *  \see \c device_new
@@ -180,9 +178,9 @@ template<typename InputIterator, typename ForwardIterator>
  *  \return An iterator pointing to the last element of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *  \tparam Size is an integral type.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that takes
  *          a single argument whose type is \p InputIterator's \c value_type.
  *
@@ -216,7 +214,7 @@ template<typename InputIterator, typename ForwardIterator>
  *  // x.val == 46 for all 0 <= i < N
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/uninitialized_copy.html
+ *  \see https://en.cppreference.com/w/cpp/memory/uninitialized_copy
  *  \see \c uninitialized_copy
  *  \see \c copy
  *  \see \c uninitialized_fill
@@ -246,9 +244,9 @@ __host__ __device__
  *  \param result The first element of the output range to copy to.
  *  \return An iterator pointing to the last element of the output range.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *  \tparam Size is an integral type.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that takes
  *          a single argument whose type is \p InputIterator's \c value_type.
  *
@@ -280,7 +278,7 @@ __host__ __device__
  *  // x.val == 46 for all 0 <= i < N
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/uninitialized_copy.html
+ *  \see https://en.cppreference.com/w/cpp/memory/uninitialized_copy
  *  \see \c uninitialized_copy
  *  \see \c copy
  *  \see \c uninitialized_fill
@@ -296,8 +294,6 @@ template<typename InputIterator, typename Size, typename ForwardIterator>
 /*! \} // copying
  */
 
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/uninitialized_copy.inl>
-
diff --git a/thrust/uninitialized_fill.h b/thrust/uninitialized_fill.h
index 33dc24886..b46758a3c 100644
--- a/thrust/uninitialized_fill.h
+++ b/thrust/uninitialized_fill.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup filling
  *  \ingroup transformations
@@ -51,7 +49,7 @@ namespace thrust
  *  \param x The value to use as the exemplar of the copy constructor.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that
  *          takes a single argument of type \p T.
  *
@@ -80,7 +78,7 @@ namespace thrust
  *  // x.val == 46 for all 0 <= i < N
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/uninitialized_fill.html
+ *  \see https://en.cppreference.com/w/cpp/memory/uninitialized_fill
  *  \see \c uninitialized_fill_n
  *  \see \c fill
  *  \see \c uninitialized_copy
@@ -108,7 +106,7 @@ __host__ __device__
  *  \param last The last element of the range of interest.
  *  \param x The value to use as the exemplar of the copy constructor.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that
  *          takes a single argument of type \p T.
  *
@@ -136,7 +134,7 @@ __host__ __device__
  *  // x.val == 46 for all 0 <= i < N
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/uninitialized_fill.html
+ *  \see https://en.cppreference.com/w/cpp/memory/uninitialized_fill
  *  \see \c uninitialized_fill_n
  *  \see \c fill
  *  \see \c uninitialized_copy
@@ -167,7 +165,7 @@ template<typename ForwardIterator, typename T>
  *  \return <tt>first+n</tt>
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that
  *          takes a single argument of type \p T.
  *
@@ -196,7 +194,7 @@ template<typename ForwardIterator, typename T>
  *  // x.val == 46 for all 0 <= i < N
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/uninitialized_fill.html
+ *  \see https://en.cppreference.com/w/cpp/memory/uninitialized_fill
  *  \see \c uninitialized_fill
  *  \see \c fill
  *  \see \c uninitialized_copy_n
@@ -225,7 +223,7 @@ __host__ __device__
  *  \param x The value to use as the exemplar of the copy constructor.
  *  \return <tt>first+n</tt>
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that
  *          takes a single argument of type \p T.
  *
@@ -253,7 +251,7 @@ __host__ __device__
  *  // x.val == 46 for all 0 <= i < N
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/uninitialized_fill.html
+ *  \see https://en.cppreference.com/w/cpp/memory/uninitialized_fill
  *  \see \c uninitialized_fill
  *  \see \c fill
  *  \see \c uninitialized_copy_n
@@ -269,7 +267,6 @@ template<typename ForwardIterator, typename Size, typename T>
  *  \} // transformations
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/uninitialized_fill.inl>
-
diff --git a/thrust/unique.h b/thrust/unique.h
index b4b2118d3..234cd4935 100644
--- a/thrust/unique.h
+++ b/thrust/unique.h
@@ -23,11 +23,10 @@
 
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
+#include <thrust/iterator/iterator_traits.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup stream_compaction
  *  \{
@@ -53,9 +52,9 @@ namespace thrust
  *  \return The end of the unique range <tt>[first, new_last)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
- *          and \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *          and \p ForwardIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
  *
  *  The following code snippet demonstrates how to use \p unique to
  *  compact a sequence of numbers to remove consecutive duplicates using the \p thrust::host execution policy
@@ -72,7 +71,7 @@ namespace thrust
  *  // Values beyond new_end are unspecified.
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/unique.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/unique
  *  \see unique_copy
  */
 template<typename DerivedPolicy,
@@ -98,9 +97,9 @@ ForwardIterator unique(const thrust::detail::execution_policy_base<DerivedPolicy
  *  \param last  The end of the input range.
  *  \return The end of the unique range <tt>[first, new_last)</tt>.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
- *          and \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *          and \p ForwardIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
  *
  *  The following code snippet demonstrates how to use \p unique to
  *  compact a sequence of numbers to remove consecutive duplicates.
@@ -115,7 +114,7 @@ ForwardIterator unique(const thrust::detail::execution_policy_base<DerivedPolicy
  *  // Values beyond new_end are unspecified.
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/unique.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/unique
  *  \see unique_copy
  */
 template<typename ForwardIterator>
@@ -144,10 +143,10 @@ ForwardIterator unique(ForwardIterator first,
  *  \return The end of the unique range <tt>[first, new_last)</tt>
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
  *          and \p ForwardIterator's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type and to \p BinaryPredicate's \c second_argument_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p unique to
  *  compact a sequence of numbers to remove consecutive duplicates using the \p thrust::host execution policy
@@ -164,7 +163,7 @@ ForwardIterator unique(ForwardIterator first,
  *  // Values beyond new_end are unspecified.
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/unique.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/unique
  *  \see unique_copy
  */
 template<typename DerivedPolicy,
@@ -194,10 +193,10 @@ ForwardIterator unique(const thrust::detail::execution_policy_base<DerivedPolicy
  *  \param binary_pred  The binary predicate used to determine equality.
  *  \return The end of the unique range <tt>[first, new_last)</tt>
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
  *          and \p ForwardIterator's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type and to \p BinaryPredicate's \c second_argument_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p unique to
  *  compact a sequence of numbers to remove consecutive duplicates.
@@ -212,7 +211,7 @@ ForwardIterator unique(const thrust::detail::execution_policy_base<DerivedPolicy
  *  // Values beyond new_end are unspecified.
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/unique.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/unique
  *  \see unique_copy
  */
 template<typename ForwardIterator,
@@ -248,9 +247,9 @@ ForwardIterator unique(ForwardIterator first,
  *  \return The end of the unique range <tt>[result, result_end)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *          and \p InputIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre The range <tt>[first,last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap.
@@ -272,7 +271,7 @@ ForwardIterator unique(ForwardIterator first,
  *  \endcode
  *
  *  \see unique
- *  \see http://www.sgi.com/tech/stl/unique_copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/unique_copy
  */
 template<typename DerivedPolicy,
          typename InputIterator,
@@ -306,9 +305,9 @@ OutputIterator unique_copy(const thrust::detail::execution_policy_base<DerivedPo
  *  \param result The beginning of the output range.
  *  \return The end of the unique range <tt>[result, result_end)</tt>.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *          and \p InputIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre The range <tt>[first,last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap.
@@ -328,7 +327,7 @@ OutputIterator unique_copy(const thrust::detail::execution_policy_base<DerivedPo
  *  \endcode
  *
  *  \see unique
- *  \see http://www.sgi.com/tech/stl/unique_copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/unique_copy
  */
 template<typename InputIterator,
          typename OutputIterator>
@@ -355,11 +354,11 @@ OutputIterator unique_copy(InputIterator first,
  *  \return The end of the unique range <tt>[result, result_end)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *          and \p InputIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  \pre The range <tt>[first,last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -380,7 +379,7 @@ OutputIterator unique_copy(InputIterator first,
  *  \endcode
  *
  *  \see unique
- *  \see http://www.sgi.com/tech/stl/unique_copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/unique_copy
  */
 template<typename DerivedPolicy,
          typename InputIterator,
@@ -408,11 +407,11 @@ OutputIterator unique_copy(const thrust::detail::execution_policy_base<DerivedPo
  *  \param binary_pred  The binary predicate used to determine equality.
  *  \return The end of the unique range <tt>[result, result_end)</tt>.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *          and \p InputIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  \pre The range <tt>[first,last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -431,7 +430,7 @@ OutputIterator unique_copy(const thrust::detail::execution_policy_base<DerivedPo
  *  \endcode
  *
  *  \see unique
- *  \see http://www.sgi.com/tech/stl/unique_copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/unique_copy
  */
 template<typename InputIterator,
          typename OutputIterator,
@@ -465,10 +464,10 @@ OutputIterator unique_copy(InputIterator first,
  *  \return A pair of iterators at end of the ranges <tt>[key_first, keys_new_last)</tt> and <tt>[values_first, values_new_last)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator1 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator1 is mutable,
- *          and \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam ForwardIterator2 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam ForwardIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator2 is mutable.
  *
  *  \pre The range <tt>[keys_first, keys_last)</tt> and the range <tt>[values_first, values_first + (keys_last - keys_first))</tt> shall not overlap.
@@ -526,10 +525,10 @@ __host__ __device__
  *  \param values_first The beginning of the value range.
  *  \return A pair of iterators at end of the ranges <tt>[key_first, keys_new_last)</tt> and <tt>[values_first, values_new_last)</tt>.
  *
- *  \tparam ForwardIterator1 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator1 is mutable,
- *          and \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam ForwardIterator2 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam ForwardIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator2 is mutable.
  *
  *  \pre The range <tt>[keys_first, keys_last)</tt> and the range <tt>[values_first, values_first + (keys_last - keys_first))</tt> shall not overlap.
@@ -583,12 +582,12 @@ template<typename ForwardIterator1,
  *  \return The end of the unique range <tt>[first, new_last)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator1 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator1 is mutable,
- *          and \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam ForwardIterator2 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam ForwardIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator2 is mutable.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  \pre The range <tt>[keys_first, keys_last)</tt> and the range <tt>[values_first, values_first + (keys_last - keys_first))</tt> shall not overlap.
  *
@@ -645,12 +644,12 @@ __host__ __device__
  *  \param binary_pred  The binary predicate used to determine equality.
  *  \return The end of the unique range <tt>[first, new_last)</tt>.
  *
- *  \tparam ForwardIterator1 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator1 is mutable,
- *          and \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam ForwardIterator2 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam ForwardIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator2 is mutable.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  \pre The range <tt>[keys_first, keys_last)</tt> and the range <tt>[values_first, values_first + (keys_last - keys_first))</tt> shall not overlap.
  *
@@ -707,11 +706,11 @@ template<typename ForwardIterator1,
  *  \return A pair of iterators at end of the ranges <tt>[keys_result, keys_result_last)</tt> and <tt>[values_result, values_result_last)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
  *
  *  \pre The input ranges shall not overlap either output range.
@@ -773,11 +772,11 @@ __host__ __device__
  *  \param values_result The beginning of the output value range.
  *  \return A pair of iterators at end of the ranges <tt>[keys_result, keys_result_last)</tt> and <tt>[values_result, values_result_last)</tt>.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
  *
  *  \pre The input ranges shall not overlap either output range.
@@ -839,13 +838,13 @@ template<typename InputIterator1,
  *  \return A pair of iterators at end of the ranges <tt>[keys_result, keys_result_last)</tt> and <tt>[values_result, values_result_last)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  \pre The input ranges shall not overlap either output range.
  *
@@ -910,13 +909,13 @@ __host__ __device__
  *  \param binary_pred  The binary predicate used to determine equality.
  *  \return A pair of iterators at end of the ranges <tt>[keys_result, keys_result_last)</tt> and <tt>[values_result, values_result_last)</tt>.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  \pre The input ranges shall not overlap either output range.
  *
@@ -958,11 +957,184 @@ template<typename InputIterator1,
                      BinaryPredicate binary_pred);
 
 
-/*! \} // end stream_compaction
+/*! \p unique_count counts runs of equal elements in the range <tt>[first, last)</tt>
+ *  with the same value, 
+ *
+ *  This version of \p unique_count uses the function object \p binary_pred to test for equality.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input range.
+ *  \param last  The end of the input range.
+ *  \param binary_pred  The binary predicate used to determine equality.
+ *  \return The number of runs of equal elements in <tt>[first, new_last)</tt>
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type and to \p BinaryPredicate's \c second_argument_type.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p unique_count to
+ *  determine a number of runs of equal elements using the \p thrust::host execution policy
+ *  for parallelization:
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1};
+ *  int count = thrust::unique_count(thrust::host, A, A + N, thrust::equal_to<int>());
+ *  // count is now 4
+ *  \endcode
+ *
+ *  \see unique_copy
+ *  \see unique_by_key_copy
+ *  \see reduce_by_key_copy
+ */
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+__host__ __device__
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred);
+
+
+/*! \p unique_count counts runs of equal elements in the range <tt>[first, last)</tt>
+ *  with the same value, 
+ *
+ *  This version of \p unique_count uses \c operator== to test for equality.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input range.
+ *  \param last  The end of the input range.
+ *  \param binary_pred  The binary predicate used to determine equality.
+ *  \return The number of runs of equal elements in <tt>[first, new_last)</tt>
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type and to \p BinaryPredicate's \c second_argument_type.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p unique_count to
+ *  determine the number of runs of equal elements using the \p thrust::host execution policy
+ *  for parallelization:
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1};
+ *  int count = thrust::unique_count(thrust::host, A, A + N);
+ *  // count is now 4
+ *  \endcode
+ *
+ *  \see unique_copy
+ *  \see unique_by_key_copy
+ *  \see reduce_by_key_copy
+ */
+template<typename DerivedPolicy,
+         typename ForwardIterator>
+__host__ __device__
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last);
+
+
+/*! \p unique_count counts runs of equal elements in the range <tt>[first, last)</tt>
+ *  with the same value, 
+ *
+ *  This version of \p unique_count uses the function object \p binary_pred to test for equality.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input range.
+ *  \param last  The end of the input range.
+ *  \param binary_pred  The binary predicate used to determine equality.
+ *  \return The number of runs of equal elements in <tt>[first, new_last)</tt>
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type and to \p BinaryPredicate's \c second_argument_type.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p unique_count to
+ *  determine the number of runs of equal elements:
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1};
+ *  int count = thrust::unique_count(A, A + N, thrust::equal_to<int>());
+ *  // count is now 4
+ *  \endcode
+ *
+ *  \see unique_copy
+ *  \see unique_by_key_copy
+ *  \see reduce_by_key_copy
+ */
+template<typename ForwardIterator,
+         typename BinaryPredicate>
+__host__ __device__
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred);
+
+
+/*! \p unique_count counts runs of equal elements in the range <tt>[first, last)</tt>
+ *  with the same value, 
+ *
+ *  This version of \p unique_count uses \c operator== to test for equality.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input range.
+ *  \param last  The end of the input range.
+ *  \param binary_pred  The binary predicate used to determine equality.
+ *  \return The number of runs of equal elements in <tt>[first, new_last)</tt>
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type and to \p BinaryPredicate's \c second_argument_type.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p unique_count to
+ *  determine the number of runs of equal elements:
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1};
+ *  int count = thrust::unique_count(thrust::host, A, A + N);
+ *  // count is now 4
+ *  \endcode
+ *
+ *  \see unique_copy
+ *  \see unique_by_key_copy
+ *  \see reduce_by_key_copy
  */
+template<typename ForwardIterator>
+__host__ __device__
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(ForwardIterator first,
+                 ForwardIterator last);
 
 
-} // end namespace thrust
+/*! \} // end stream_compaction
+ */
+
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/unique.inl>
 
diff --git a/thrust/universal_allocator.h b/thrust/universal_allocator.h
new file mode 100644
index 000000000..8d85cd20d
--- /dev/null
+++ b/thrust/universal_allocator.h
@@ -0,0 +1,77 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file universal_allocator.h
+ *  \brief An allocator which creates new elements in memory accessible to both
+ *         hosts and devices.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// #include the device system's vector header
+#define __THRUST_DEVICE_SYSTEM_MEMORY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/memory.h>
+#include __THRUST_DEVICE_SYSTEM_MEMORY_HEADER
+#undef __THRUST_DEVICE_SYSTEM_MEMORY_HEADER
+
+THRUST_NAMESPACE_BEGIN
+
+/** \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management_classes
+ *  \{
+ */
+
+/*! \brief An allocator which creates new elements in memory accessible by
+ *         both hosts and devices.
+ *
+ *  \see https://en.cppreference.com/w/cpp/named_req/Allocator
+ */
+using thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::universal_allocator;
+
+/*! \p universal_ptr stores a pointer to an object allocated in memory accessible
+ *  to both hosts and devices.
+ *
+ *  Algorithms dispatched with this type of pointer will be dispatched to
+ *  either host or device, depending on which backend you are using. Explicit
+ *  policies (\p thrust::device, etc) can be used to specify where an algorithm
+ *  should be run.
+ *
+ *  \p universal_ptr has pointer semantics: it may be dereferenced safely from
+ *  both hosts and devices and may be manipulated with pointer arithmetic.
+ *
+ *  \p universal_ptr can be created with \p universal_allocator or by explicitly
+ *  calling its constructor with a raw pointer.
+ *
+ *  The raw pointer encapsulated by a \p universal_ptr may be obtained by
+ *  either its <tt>get</tt> method or the \p raw_pointer_cast free function.
+ *
+ *  \note \p universal_ptr is not a smart pointer; it is the programmer's
+ *  responsibility to deallocate memory pointed to by \p universal_ptr.
+ *
+ *  \see host_ptr For the documentation of the complete interface which is
+ *                shared by \p universal_ptr.
+ *  \see raw_pointer_cast
+ */
+template <typename T>
+using universal_ptr =
+  thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::universal_pointer<T>;
+
+/*! \}
+ */
+
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/host_vector.inl b/thrust/universal_ptr.h
similarity index 57%
rename from thrust/detail/host_vector.inl
rename to thrust/universal_ptr.h
index e424dd1e1..9d1de19d5 100644
--- a/thrust/detail/host_vector.inl
+++ b/thrust/universal_ptr.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2020 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -15,24 +15,12 @@
  */
 
 
-/*! \file host_vector.inl
- *  \brief Inline file for host_vector.h.
+/*! \file universal_ptr.h
+ *  \brief A pointer to a variable which resides memory accessible to both
+ *         hosts and devices.
  */
 
-#include <thrust/host_vector.h>
+#pragma once
 
-namespace thrust
-{
-
-template<typename T, typename Alloc>
-  template<typename OtherT, typename OtherAlloc>
-    __host__
-    host_vector<T,Alloc>
-      ::host_vector(const device_vector<OtherT,OtherAlloc> &v)
-        :Parent(v)
-{
-  ;
-} // end host_vector::host_vector()
-
-} // end namespace thrust
+#include <thrust/universal_allocator.h>
 
diff --git a/thrust/universal_vector.h b/thrust/universal_vector.h
new file mode 100644
index 000000000..0ce38fd86
--- /dev/null
+++ b/thrust/universal_vector.h
@@ -0,0 +1,55 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file
+ *  \brief A dynamically-sizable array of elements which resides in memory
+ *         accessible to both hosts and devices.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/universal_allocator.h>
+
+// #include the device system's vector header
+#define __THRUST_DEVICE_SYSTEM_VECTOR_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/vector.h>
+#include __THRUST_DEVICE_SYSTEM_VECTOR_HEADER
+#undef __THRUST_DEVICE_SYSTEM_VECTOR_HEADER
+
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup containers Containers
+ *  \{
+ */
+
+/*! A \p universal_vector is a container that supports random access to elements,
+ *  constant time removal of elements at the end, and linear time insertion
+ *  and removal of elements at the beginning or in the middle. The number of
+ *  elements in a \p universal_vector may vary dynamically; memory management is
+ *  automatic. The memory associated with a \p universal_vector resides in memory
+ *  accessible to hosts and devices.
+ *
+ *  \see https://en.cppreference.com/w/cpp/container/vector
+ *  \see host_vector For the documentation of the complete interface which is
+ *                   shared by \p universal_vector.
+ *  \see device_vector
+ */
+using thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::universal_vector;
+
+/*! \} // containers
+ */
+
+THRUST_NAMESPACE_END
diff --git a/thrust/version.h b/thrust/version.h
index eec81f3eb..71f1adb69 100644
--- a/thrust/version.h
+++ b/thrust/version.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2022 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -47,7 +47,7 @@
  *         <tt>THRUST_VERSION / 100 % 1000</tt> is the minor version.
  *         <tt>THRUST_VERSION / 100000</tt> is the major version.
  */
-#define THRUST_VERSION 100906
+#define THRUST_VERSION 200200
 
 /*! \def THRUST_MAJOR_VERSION
  *  \brief The preprocessor macro \p THRUST_MAJOR_VERSION encodes the
@@ -70,26 +70,6 @@
 /*! \def THRUST_PATCH_NUMBER
  *  \brief The preprocessor macro \p THRUST_PATCH_NUMBER encodes the
  *         patch number of the Thrust library.
+ *         Legacy; will be 0 for all future releases.
  */
 #define THRUST_PATCH_NUMBER 0
-
-
-// Declare these namespaces here for the purpose of Doxygenating them
-
-/*! \namespace thrust
- *  \brief \p thrust is the top-level namespace which contains all Thrust
- *         functions and types.
- */
-namespace thrust
-{
-
-}
-
-#ifndef THRUST_BEGIN_NS
-#define THRUST_BEGIN_NS namespace thrust {
-#endif
-
-#ifndef THRUST_END_NS
-#define THRUST_END_NS }
-#endif
-
diff --git a/thrust/zip_function.h b/thrust/zip_function.h
new file mode 100644
index 000000000..7653f9b7f
--- /dev/null
+++ b/thrust/zip_function.h
@@ -0,0 +1,212 @@
+
+/*! \file thrust/zip_function.h
+ *  \brief Adaptor type that turns an N-ary function object into one that takes
+ *         a tuple of size N so it can easily be used with algorithms taking zip
+ *         iterators
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+#include <thrust/detail/modern_gcc_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+
+#include <thrust/tuple.h>
+#include <thrust/type_traits/integer_sequence.h>
+#include <thrust/detail/type_deduction.h>
+
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup function_objects Function Objects
+ *  \{
+ */
+
+/*! \addtogroup function_object_adaptors Function Object Adaptors
+ *  \ingroup function_objects
+ *  \{
+ */
+
+namespace detail {
+namespace zip_detail {
+
+// Add workaround for decltype(auto) on C++11-only compilers:
+#if THRUST_CPP_DIALECT >= 2014
+
+__thrust_exec_check_disable__
+template <typename Function, typename Tuple, std::size_t... Is>
+__host__ __device__
+decltype(auto) apply_impl(Function&& func, Tuple&& args, index_sequence<Is...>)
+{
+  return func(thrust::get<Is>(THRUST_FWD(args))...);
+}
+
+template <typename Function, typename Tuple>
+__host__ __device__
+decltype(auto) apply(Function&& func, Tuple&& args)
+{
+  constexpr auto tuple_size = thrust::tuple_size<typename std::decay<Tuple>::type>::value;
+  return apply_impl(THRUST_FWD(func), THRUST_FWD(args), make_index_sequence<tuple_size>{});
+}
+
+#else // THRUST_CPP_DIALECT
+
+__thrust_exec_check_disable__
+template <typename Function, typename Tuple, std::size_t... Is>
+__host__ __device__
+auto apply_impl(Function&& func, Tuple&& args, index_sequence<Is...>)
+THRUST_DECLTYPE_RETURNS(func(thrust::get<Is>(THRUST_FWD(args))...))
+
+template <typename Function, typename Tuple>
+__host__ __device__
+auto apply(Function&& func, Tuple&& args)
+THRUST_DECLTYPE_RETURNS(
+    apply_impl(
+      THRUST_FWD(func),
+      THRUST_FWD(args),
+      make_index_sequence<
+        thrust::tuple_size<typename std::decay<Tuple>::type>::value>{})
+)
+
+#endif // THRUST_CPP_DIALECT
+
+} // namespace zip_detail
+} // namespace detail
+
+/*! \p zip_function is a function object that allows the easy use of N-ary
+ *  function objects with \p zip_iterators without redefining them to take a
+ *  \p tuple instead of N arguments.
+ *
+ *  This means that if a functor that takes 2 arguments which could be used with
+ *  the \p transform function and \p device_iterators can be extended to take 3
+ *  arguments and \p zip_iterators without rewriting the functor in terms of
+ *  \p tuple.
+ *
+ *  The \p make_zip_function convenience function is provided to avoid having
+ *  to explicitely define the type of the functor when creating a \p zip_function,
+ *  whic is especially helpful when using lambdas as the functor.
+ *
+ *  \code
+ *  #include <thrust/iterator/zip_iterator.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/transform.h>
+ *  #include <thrust/zip_function.h>
+ *
+ *  struct SumTuple {
+ *    float operator()(Tuple tup) {
+ *      return std::get<0>(tup) + std::get<1>(tup) + std::get<2>(tup);
+ *    }
+ *  };
+ *  struct SumArgs {
+ *    float operator()(float a, float b, float c) {
+ *      return a + b + c;
+ *    }
+ *  };
+ *
+ *  int main() {
+ *    thrust::device_vector<float> A(3);
+ *    thrust::device_vector<float> B(3);
+ *    thrust::device_vector<float> C(3);
+ *    thrust::device_vector<float> D(3);
+ *    A[0] = 0.f; A[1] = 1.f; A[2] = 2.f;
+ *    B[0] = 1.f; B[1] = 2.f; B[2] = 3.f;
+ *    C[0] = 2.f; C[1] = 3.f; C[2] = 4.f;
+ *
+ *    // The following four invocations of transform are equivalent
+ *    // Transform with 3-tuple
+ *    thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())),
+ *                      thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end(), C.end())),
+ *                      D.begin(),
+ *                      SumTuple{});
+ *
+ *    // Transform with 3 parameters
+ *    thrust::zip_function<SumArgs> adapted{};
+ *    thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())),
+ *                      thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end(), C.end())),
+ *                      D.begin(),
+ *                      adapted);
+ *
+ *    // Transform with 3 parameters with convenience function
+ *    thrust::zip_function<SumArgs> adapted{};
+ *    thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())),
+ *                      thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end(), C.end())),
+ *                      D.begin(),
+ *                      thrust::make_zip_function(SumArgs{}));
+ *
+ *    // Transform with 3 parameters with convenience function and lambda
+ *    thrust::zip_function<SumArgs> adapted{};
+ *    thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())),
+ *                      thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end(), C.end())),
+ *                      D.begin(),
+ *                      thrust::make_zip_function([] (float a, float b, float c) {
+ *                                                  return a + b + c;
+ *                                                }));
+ *    return 0;
+ *  }
+ *  \endcode
+ *
+ *  \see make_zip_function
+ *  \see zip_iterator
+ */
+template <typename Function>
+class zip_function
+{
+  public:
+     __host__ __device__
+    zip_function(Function func) : func(std::move(func)) {}
+
+// Add workaround for decltype(auto) on C++11-only compilers:
+#if THRUST_CPP_DIALECT >= 2014
+
+    template <typename Tuple>
+    __host__ __device__
+    decltype(auto) operator()(Tuple&& args) const
+    {
+        return detail::zip_detail::apply(func, THRUST_FWD(args));
+    }
+
+#else // THRUST_CPP_DIALECT
+
+    // Can't just use THRUST_DECLTYPE_RETURNS here since we need to use
+    // std::declval for the signature components:
+    template <typename Tuple>
+    __host__ __device__
+    auto operator()(Tuple&& args) const
+    noexcept(noexcept(detail::zip_detail::apply(std::declval<Function>(), THRUST_FWD(args))))
+    THRUST_TRAILING_RETURN(decltype(detail::zip_detail::apply(std::declval<Function>(), THRUST_FWD(args))))
+    {
+        return detail::zip_detail::apply(func, THRUST_FWD(args));
+    }
+
+#endif // THRUST_CPP_DIALECT
+
+  private:
+    mutable Function func;
+};
+
+/*! \p make_zip_function creates a \p zip_function from a function object.
+ *
+ *  \param fun The N-ary function object.
+ *  \return A \p zip_function that takes a N-tuple.
+ *
+ *  \see zip_function
+ */
+template <typename Function>
+__host__ __device__
+zip_function<typename std::decay<Function>::type>
+make_zip_function(Function&& fun)
+{
+    using func_t = typename std::decay<Function>::type;
+    return zip_function<func_t>(THRUST_FWD(fun));
+}
+
+/*! \} // end function_object_adaptors
+ */
+
+/*! \} // end function_objects
+ */
+
+THRUST_NAMESPACE_END
+
+#endif
diff --git a/thrust_perf_tests.trs b/thrust_perf_tests.trs
deleted file mode 100644
index d4d76e8f7..000000000
--- a/thrust_perf_tests.trs
+++ /dev/null
@@ -1,37 +0,0 @@
-{
-  # Descriptive name for the testsuite (required).
-  "name"        : "Thrust Performance Testsuite",
-  "version"     : "2",
-  "extrapath"   : [ "{TR_INSTALL_DIR}/bin/{TR_ARCH_PATH}" ],
-  # Testsuite owner's email (required).
-  "owner"       : "blelbach@nvidia.com",
-  # Define paths containing shared libraries required by the tests. Use envvar VULCAN_SHAREDLIB_DIR to refer 
-  # to the platform specific portion of the path (e.g. bin/ for windows, lib64/ for 64-bit
-  # Linux, etc.)
-  "librarypath" : [ "{TR_INSTALL_DIR}\/cuda\/_internal\/driver" ],
-  # Default working directory for test runs (optional).
-  "cwd"        : "{TR_INSTALL_DIR}/bin/{TR_ARCH_PATH}",
-  # Timeout for entire testsuite, in seconds (optional).
-  "timeout"     : "3600",
-  # Default timeout for individual tests, in seconds (optional).
-  "testtimeout" : "3600",
-  # The tests in the testsuite (required).
-  "tests" : [
-      {
-        "exe" : "{PYTHON} {TR_INSTALL_DIR}/opencl/_tests/GPUConfMgr/GPUConfMgr.py -caseName=set -clock=P0MAX -VULCAN_INSTALL={TR_INSTALL_DIR}",
-        "attributes" : [ ]
-      },
-      {
-        "exe": "{PYTHON} {TR_TESTSUITE_DIR}/internal/scripts/eris_perf.py -b {TR_INSTALL_DIR}/bin/{TR_ARCH_PATH}/bench -p {TR_INSTALL_DIR}/thrust/internal/benchmark/combine_benchmark_results.py",
-        "attributes": [ "result=multi" ]
-      },
-      {
-        "exe" : "{PYTHON} {TR_INSTALL_DIR}/opencl/_tests/GPUConfMgr/GPUConfMgr.py -caseName=reset -clock=P0MAX -VULCAN_INSTALL={TR_INSTALL_DIR}",
-        "attributes" : [ ]
-      }
- ]
-}
-
-# File /home/rjray/Perforce/general/gpgpu/thrust/thrust_perf_tests.trs
-# Converted from /home/rjray/Perforce/general/gpgpu/thrust/thrust_perf_tests.vlct
-# Converted by tr_configtool.pl/0.4, on Fri Oct  6 13:07:44 2017
diff --git a/thrust_perf_tests.vlcc b/thrust_perf_tests.vlcc
deleted file mode 100644
index d02bf9e68..000000000
--- a/thrust_perf_tests.vlcc
+++ /dev/null
@@ -1,38 +0,0 @@
-# Thrust performance tests component configuration. 
-{ 
-  # Descriptive name for the component
-  "name"      : "Thrust Performance Test Suite",
-  "type"      : "performance",
-  # Component owner (email address)
-  "owner"     : "blelbach@nvidia.com",
-  "module"    : "CUDA - Thrust",
-
-  # Build timeout (in seconds).
-  "buildtimeout" : "600",
-  # Define variables usable in this component
-  "env"       : [ "THRUST_TESTS_BIN_DIR=${VULCAN_BUILD_DIR}/bin/${VULCAN_ARCH}_${VULCAN_OS}${VULCAN_ABI}_${VULCAN_BUILD}" ],
-  # Files included in this component specified with one or more paths. 
-  # Paths are relative to this file location. Path syntax uses wildcards and p4-like '...'.
-  "files"     : [
-                  "...",
-                  { "include" : "CUDA_TOOLKIT_BUILD_FILES" }
-                ],
-  # Output produced by this component and the installation location
-  # for each output. The install location is relative to
-  # VULCAN_INSTALL_DIR. Optional 'kind' property can be used to tag
-  # artifact kinds.
-  "artifacts" : [
-                  { "${THRUST_TESTS_BIN_DIR}/bench" : "cuda/_tests/thrust_perf_tests/.", "kind": "EXE" },
-                  { "internal/benchmark/combine_benchmark_results.py" : "cuda/_tests/thrust_perf_tests/." },
-                  { "internal/scripts/eris_perf.py" : "cuda/_tests/thrust_perf_tests/." },
-                  { "thrust_perf_tests.vlct"        : "cuda/_tests/thrust_perf_tests/.", "kind": "TESTSUITE" }
-                ],
-  # Dependencies for this component.
-  "depends"   : [ "driver_headers", "driver", "compiler", "cuda", "thrust", "GPUConfMgr" ],
-  # The agent for this component, relative to this file location. The
-  # agent is invoked to perform component actions.
-  "agent"     : {
-                  "CUDA_MAKE" : "Makefile", "build_target" : "all", "clean_target" : "clean",
-                  "args" : [ "TEST_BENCH=1" ]
-                }
-}
diff --git a/thrust_perf_tests.vlct b/thrust_perf_tests.vlct
deleted file mode 100644
index 28c414426..000000000
--- a/thrust_perf_tests.vlct
+++ /dev/null
@@ -1,33 +0,0 @@
-# Thrust performance tests component configuration. 
-{
-  # Descriptive name for the testsuite (required).
-  "name"        : "Thrust Performance Testsuite",
-  # Testsuite owner's email (required).
-  "owner"       : "blelbach@nvidia.com",
-
-  # Define paths containing shared libraries required by the tests. Use envvar VULCAN_SHAREDLIB_DIR to refer 
-  # to the platform specific portion of the path (e.g. bin/ for windows, lib64/ for 64-bit
-  # Linux, etc.)
-  "dllpath"     : [ "${VULCAN_INSTALL_DIR}/cuda/_internal/driver" ],
-  # Default working directory for test runs (optional).
-  "cwd"         : "${VULCAN_TESTSUITE_DIR}",
-  # Timeout for entire testsuite, in seconds (optional).
-  "timeout"     : "3600",
-  # Default timeout for individual tests, in seconds (optional).
-  "testtimeout" : "3600",
-  # The tests in the testsuite (required).
-  "tests" : [
-      {
-        "exe" : "${PYTHON} ${VULCAN_INSTALL_DIR}/opencl/_tests/GPUConfMgr/GPUConfMgr.py -caseName=set -clock=P0MAX -VULCAN_INSTALL=${VULCAN_INSTALL_DIR}",
-        "attributes" : [ ]
-      },
-      {
-        "exe": "${PYTHON} eris_perf.py",
-        "attributes": [ "result=multi" ]
-      },
-      {
-        "exe" : "${PYTHON} ${VULCAN_INSTALL_DIR}/opencl/_tests/GPUConfMgr/GPUConfMgr.py -caseName=reset -clock=P0MAX -VULCAN_INSTALL=${VULCAN_INSTALL_DIR}",
-        "attributes" : [ ]
-      }
- ]
-}
diff --git a/thrust_tests.trs b/thrust_tests.trs
deleted file mode 100644
index de276a86a..000000000
--- a/thrust_tests.trs
+++ /dev/null
@@ -1,31 +0,0 @@
-{ 
-  # Descriptive name for the component
-  "name"        : "Thrust Test Suite",
-  "version"     : "2",
-  # Component owner (email address)
-  "owner"       : "blelbach@nvidia.com",
-
-  "extrapath"   : [ "{TR_INSTALL_DIR}/bin/{TR_ARCH_PATH}" ],
-  # Define paths containing shared libraries required by the tests. Use envvar TR_SHAREDLIB_DIR to refer 
-  # to the platform specific portion of the path (e.g. bin/ for windows, lib64/ for 64-bit
-  # Linux, etc.)
-  "librarypath" : [ "{TR_INSTALL_DIR}/bin/{TR_ARCH_PATH}", "{TR_INSTALL_DIR}\/cuda\/_internal\/driver" ],
-  # Default working directory for test runs (optional). The directory can be a an absolute
-  # or relative path. A relative path is relative to this file's location. Variables can
-  # be used in the path using the {var} syntax.
-  "cwd"         : "{TR_INSTALL_DIR}/bin/{TR_ARCH_PATH}",
-  # Timeout for entire testsuite, in seconds (optional). If not timeout is specified the
-  # default timeout value of 900 seconds will be used.
-  "timeout"     : "12000",
-  # Default timeout for individual tests, in seconds (optional).
-  "testtimeout" : "900",
-  # The tests in the testsuite (required).
-  "tests"       : [
-    
-    {
-      "exe" : "{PERL} {TR_TESTSUITE_DIR}/internal/test/thrust_nightly.pl -bin-path={TR_INSTALL_DIR}/bin/{TR_ARCH_PATH} -filecheck-data-path={TR_TESTSUITE_DIR}/internal/test -filecheck-path={TR_INSTALL_DIR}/bin/{TR_ARCH_PATH}/nvvm/tools", 
-      "attributes": [ "result=multi" ]
-    }
-
-  ]
-}
diff --git a/thrust_tests.vlcc b/thrust_tests.vlcc
deleted file mode 100644
index 32ca412fa..000000000
--- a/thrust_tests.vlcc
+++ /dev/null
@@ -1,36 +0,0 @@
-{ 
-  # Descriptive name for the component
-  "name"      : "Thrust Test Suite",
-  # Component owner (email address)
-  "owner"     : "blelbach@nvidia.com",
-  "module"    : "CUDA - Thrust",
-
-  # Build timeout (in seconds).
-  "buildtimeout" : "28800",
-  # Define variables usable in this component
-  "env"       : [ "THRUST_TESTS_BIN_DIR=${VULCAN_BUILD_DIR}/bin/${VULCAN_ARCH}_${VULCAN_OS}${VULCAN_ABI}_${VULCAN_BUILD}" ],
-  # Files included in this component specified with one or more paths. 
-  # Paths are relative to this file location. Path syntax uses wildcards and p4-like '...'.
-  "files"     : [
-                  "...",
-                  { "include" : "CUDA_TOOLKIT_BUILD_FILES" }
-                ],
-  # Output produced by this component and the installation location
-  # for each output. The install location is relative to
-  # VULCAN_INSTALL_DIR. Optional 'kind' property can be used to tag
-  # artifact kinds.
-  "artifacts" : [
-                  { "${THRUST_TESTS_BIN_DIR}/*"                    : "cuda/_tests/thrust_tests/." },
-                  { "internal/test/thrust_nightly.pl"              : "cuda/_tests/thrust_tests/." },
-                  { "internal/test/*.filecheck"                    : "cuda/_tests/thrust_tests/filecheck_data/." },
-                  { "thrust_tests.vlct"                            : "cuda/_tests/thrust_tests/.", "kind" : "TESTSUITE" }
-                ],
-  # Dependencies for this component.
-  "depends"   : [ "driver_headers", "driver", "compiler", "cuda", "thrust" ],
-  # The agent for this component, relative to this file location. The
-  # agent is invoked to perform component actions.
-  "agent"     : {
-                  "CUDA_MAKE" : "Makefile", "build_target" : "all", "clean_target" : "clean",
-                  "args" : [ "TEST_ALL=1" ]
-                }
-}
diff --git a/thrust_tests.vlct b/thrust_tests.vlct
deleted file mode 100644
index 9ecd7d521..000000000
--- a/thrust_tests.vlct
+++ /dev/null
@@ -1,31 +0,0 @@
-{
-  # Descriptive name for the testsuite (required).
-  "name"        : "Thrust Test Suite",
-  # Testsuite owner's email (required).
-  "owner"       : "blelbach@nvidia.com",
-
-  # Define paths containing shared libraries required by the tests. Use envvar VULCAN_SHAREDLIB_DIR to refer 
-  # to the platform specific portion of the path (e.g. bin/ for windows, lib64/ for 64-bit
-  # Linux, etc.)
-  "dllpath"     : [ "${VULCAN_INSTALL_DIR}/cuda/${INSTALL_TARGET_DIR}/${SHAREDLIB_DIR}",
-                    "${VULCAN_INSTALL_DIR}/cuda/_internal/driver"
-                  ],
-  # Default working directory for test runs (optional). The directory can be a an absolute
-  # or relative path. A relative path is relative to this file's location. Variables can
-  # be used in the path using the ${var} syntax.
-  "cwd"         : "${VULCAN_TESTSUITE_DIR}",
-  # Timeout for entire testsuite, in seconds (optional). If not timeout is specified the
-  # default timeout value of 900 seconds will be used.
-  "timeout"     : "12000",
-  # Default timeout for individual tests, in seconds (optional).
-  "testtimeout" : "5400",
-  # The tests in the testsuite (required).
-  "tests"       : [
-    
-    {
-      "exe" : "${PERL} thrust_nightly.pl -bin-path=${VULCAN_TESTSUITE_DIR} -filecheck-data-path=${VULCAN_TESTSUITE_DIR}/filecheck_data -filecheck-path=${VULCAN_INSTALL_DIR}/cuda/_internal/compiler/nvvm/tools",
-      "attributes" : [ "result=multi" ]
-    }
-    
-  ]
-}